Fixes an archive file being created even when noarchive was specified

author Trenton Holmes <holmes.trenton@gmail.com>

Sat, 20 Aug 2022 20:47:56 +0000 (13:47 -0700)

committer Trenton Holmes <holmes.trenton@gmail.com>

Sat, 20 Aug 2022 20:47:56 +0000 (13:47 -0700)
author Trenton Holmes <holmes.trenton@gmail.com>
Sat, 20 Aug 2022 20:47:56 +0000 (13:47 -0700)
committer Trenton Holmes <holmes.trenton@gmail.com>
Sat, 20 Aug 2022 20:47:56 +0000 (13:47 -0700)
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index 1cb79959aa3fc741779f220a9bb536c2ea941814..abb3d3dfecc5dbe0e29739d3c779edbf9443a56b 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -276,7 +276,10 @@ class RasterisedDocumentParser(DocumentParser):
              self.log("debug", f"Calling OCRmyPDF with args: {args}")
              ocrmypdf.ocr(**args)
  
-            self.archive_path = archive_path
+            # Only create archive file if archiving isn't being skipped
+            if settings.OCR_MODE != "skip_noarchive":
+                self.archive_path = archive_path
+
              self.text = self.extract_text(sidecar_file, archive_path)
  
              if not self.text:
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py

index 6bf8bd5f45b6c5cf0a21e8e38c5b7ebd2d18d036..700782a9284e459e7b4d4ddb42bc6b2bffe66096 100644 (file)
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -364,6 +364,16 @@ class TestParser(DirectoriesMixin, TestCase):
  
      @override_settings(OCR_MODE="skip_noarchive")
      def test_skip_noarchive_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from the existing text layer is extracted
+            - No archive file is created
+        """
          parser = RasterisedDocumentParser(None)
          parser.parse(
              os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
@@ -377,17 +387,29 @@ class TestParser(DirectoriesMixin, TestCase):
  
      @override_settings(OCR_MODE="skip_noarchive")
      def test_skip_noarchive_notext(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created
+        """
          parser = RasterisedDocumentParser(None)
          parser.parse(
              os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
              "application/pdf",
          )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+
          self.assertContainsStrings(
              parser.get_text().lower(),
              ["page 1", "page 2", "page 3"],
          )
  
+        self.assertIsNone(parser.archive_path)
+
      @override_settings(OCR_MODE="skip")
      def test_multi_page_mixed(self):
          parser = RasterisedDocumentParser(None)
@@ -408,6 +430,16 @@ class TestParser(DirectoriesMixin, TestCase):
  
      @override_settings(OCR_MODE="skip_noarchive")
      def test_multi_page_mixed_no_archive(self):
+        """
+        GIVEN:
+            - File with some text contained in images and some in text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created
+        """
          parser = RasterisedDocumentParser(None)
          parser.parse(
              os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
author	Trenton Holmes <holmes.trenton@gmail.com>
	Sat, 20 Aug 2022 20:47:56 +0000 (13:47 -0700)
committer	Trenton Holmes <holmes.trenton@gmail.com>
	Sat, 20 Aug 2022 20:47:56 +0000 (13:47 -0700)
src/paperless_tesseract/parsers.py		patch | blob | blame | history
src/paperless_tesseract/tests/test_parser.py		patch | blob | blame | history