git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Add a setting to disable creating an archive file 2732/head
author: Brandon Rothweiler <brandonrothweiler@gmail.com>
Wed, 22 Feb 2023 20:27:17 +0000 (15:27 -0500)
committer: Brandon Rothweiler <brandonrothweiler@gmail.com>
Wed, 22 Feb 2023 20:27:17 +0000 (15:27 -0500)
docs/configuration.md
src/paperless/checks.py
src/paperless_tesseract/parsers.py
src/paperless_tesseract/tests/test_parser.py

index 6c233c2e60c498cbcc7858e13da929bf57415cda..5cf0022f307a17cf7ea6569260e97f93d7440a5e 100644 (file)
@@ -419,7 +419,10 @@ modes are available:
         an archived version of your documents when it finds any text in
         them. This is useful if you don't want to have two
         almost-identical versions of your digital documents in the media
-        folder. This is the fastest option.
+        folder.
+
+    -   `skip_neverarchive`: In addition to `skip`, paperless will never
+        create an archived version of your documents. This is the fastest option.
 
     -   `redo`: Paperless will OCR all pages of your documents and
         attempt to replace any existing text layers with new text. This
index 845ff2d0bada8f33e46801d8f3f702623d35c738..53972bc21f032faede87ab0ca16477d5e8a29dbb 100644 (file)
@@ -127,7 +127,13 @@ def settings_values_check(app_configs, **kwargs):
                 Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
             )
 
-        if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
+        if settings.OCR_MODE not in {
+            "force",
+            "skip",
+            "redo",
+            "skip_noarchive",
+            "skip_neverarchive",
+        }:
             msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
 
         if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
index 4227583f8de086a6b3e361ffdd38cb4a6ec8a1fc..3a91e33901d888fe48d4f606e3767c74e674ab59 100644 (file)
@@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser):
 
         if settings.OCR_MODE == "force" or safe_fallback:
             ocrmypdf_args["force_ocr"] = True
-        elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
+        elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
             ocrmypdf_args["skip_text"] = True
         elif settings.OCR_MODE == "redo":
             ocrmypdf_args["redo_ocr"] = True
@@ -294,7 +294,10 @@ class RasterisedDocumentParser(DocumentParser):
 
         # If the original has text, and the user doesn't want an archive,
         # we're done here
-        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
+        if (
+            settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
+            and original_has_text
+        ):
             self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
             self.text = text_original
             return
@@ -320,7 +323,9 @@ class RasterisedDocumentParser(DocumentParser):
             self.log("debug", f"Calling OCRmyPDF with args: {args}")
             ocrmypdf.ocr(**args)
 
-            self.archive_path = archive_path
+            # Only record the archive path if the mode isn't "skip_neverarchive"
+            if settings.OCR_MODE != "skip_neverarchive":
+                self.archive_path = archive_path
 
             self.text = self.extract_text(sidecar_file, archive_path)
 
index 94b72a0ee80ff4aed875aa9c025c7d7c26b29d79..de0c3ce380ac9c1445e154ea67eb9a8043043c82 100644 (file)
@@ -438,6 +438,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
 
         self.assertIsNotNone(parser.archive_path)
 
+    @override_settings(OCR_MODE="skip_neverarchive")
+    def test_skip_neverarchive_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR mode set to skip_neverarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from the existing text layer is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_MODE="skip_neverarchive")
+    def test_skip_neverarchive_notext(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR mode set to skip_neverarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
     @override_settings(OCR_MODE="skip")
     def test_multi_page_mixed(self):
         """