Fix: Allows pre-consume scripts to modify the working path again (#5260)

author Trenton H <797416+stumpylog@users.noreply.github.com>

Sat, 6 Jan 2024 05:01:57 +0000 (21:01 -0800)

committer GitHub <noreply@github.com>

Sat, 6 Jan 2024 05:01:57 +0000 (21:01 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Sat, 6 Jan 2024 05:01:57 +0000 (21:01 -0800)
committer GitHub <noreply@github.com>
Sat, 6 Jan 2024 05:01:57 +0000 (21:01 -0800)
diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md

index ca5ed4321886f103646c3ff58f8ca0bbe35214de..4bbd7753deab007371addc21aaba46d4510ce6c5 100644 (file)
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -136,6 +136,11 @@ script can access the following relevant environment variables set:
      be triggered, leading to failures as two tasks work on the
      same document path
  
+!!! warning
+
+    If your script modifies `DOCUMENT_WORKING_PATH` in a non-deterministic
+    way, this may allow duplicate documents to be stored
+
  A simple but common example for this would be creating a simple script
  like this:
  
diff --git a/src/documents/consumer.py b/src/documents/consumer.py

index 11faeea433c49bab08d4274160dbcd65228ad811..b7a559575948d5d7b971088163ab4238227f92b8 100644 (file)
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -135,24 +135,24 @@ class Consumer(LoggingMixin):
          """
          Confirm the input file still exists where it should
          """
-        if not os.path.isfile(self.path):
+        if not os.path.isfile(self.original_path):
              self._fail(
                  ConsumerStatusShortMessage.FILE_NOT_FOUND,
-                f"Cannot consume {self.path}: File not found.",
+                f"Cannot consume {self.original_path}: File not found.",
              )
  
      def pre_check_duplicate(self):
          """
          Using the MD5 of the file, check this exact file doesn't already exist
          """
-        with open(self.path, "rb") as f:
+        with open(self.original_path, "rb") as f:
              checksum = hashlib.md5(f.read()).hexdigest()
          existing_doc = Document.objects.filter(
              Q(checksum=checksum) | Q(archive_checksum=checksum),
          )
          if existing_doc.exists():
              if settings.CONSUMER_DELETE_DUPLICATES:
-                os.unlink(self.path)
+                os.unlink(self.original_path)
              self._fail(
                  ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS,
                  f"Not consuming {self.filename}: It is a duplicate of"
@@ -211,7 +211,7 @@ class Consumer(LoggingMixin):
  
          self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
  
-        working_file_path = str(self.path)
+        working_file_path = str(self.working_copy)
          original_file_path = str(self.original_path)
  
          script_env = os.environ.copy()
@@ -343,8 +343,8 @@ class Consumer(LoggingMixin):
          Return the document object if it was successfully created.
          """
  
-        self.path = Path(path).resolve()
-        self.filename = override_filename or self.path.name
+        self.original_path = Path(path).resolve()
+        self.filename = override_filename or self.original_path.name
          self.override_title = override_title
          self.override_correspondent_id = override_correspondent_id
          self.override_document_type_id = override_document_type_id
@@ -377,17 +377,16 @@ class Consumer(LoggingMixin):
          self.log.info(f"Consuming {self.filename}")
  
          # For the actual work, copy the file into a tempdir
-        self.original_path = self.path
          tempdir = tempfile.TemporaryDirectory(
              prefix="paperless-ngx",
              dir=settings.SCRATCH_DIR,
          )
-        self.path = Path(tempdir.name) / Path(self.filename)
-        copy_file_with_basic_stats(self.original_path, self.path)
+        self.working_copy = Path(tempdir.name) / Path(self.filename)
+        copy_file_with_basic_stats(self.original_path, self.working_copy)
  
          # Determine the parser class.
  
-        mime_type = magic.from_file(self.path, mime=True)
+        mime_type = magic.from_file(self.working_copy, mime=True)
  
          self.log.debug(f"Detected mime type: {mime_type}")
  
@@ -406,7 +405,7 @@ class Consumer(LoggingMixin):
  
          document_consumption_started.send(
              sender=self.__class__,
-            filename=self.path,
+            filename=self.working_copy,
              logging_group=self.logging_group,
          )
  
@@ -444,7 +443,7 @@ class Consumer(LoggingMixin):
                  ConsumerStatusShortMessage.PARSING_DOCUMENT,
              )
              self.log.debug(f"Parsing {self.filename}...")
-            document_parser.parse(self.path, mime_type, self.filename)
+            document_parser.parse(self.working_copy, mime_type, self.filename)
  
              self.log.debug(f"Generating thumbnail for {self.filename}...")
              self._send_progress(
@@ -454,7 +453,7 @@ class Consumer(LoggingMixin):
                  ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
              )
              thumbnail = document_parser.get_thumbnail(
-                self.path,
+                self.working_copy,
                  mime_type,
                  self.filename,
              )
@@ -527,7 +526,7 @@ class Consumer(LoggingMixin):
  
                      self._write(
                          document.storage_type,
-                        self.original_path,
+                        self.working_copy,
                          document.source_path,
                      )
  
@@ -560,9 +559,9 @@ class Consumer(LoggingMixin):
                  document.save()
  
                  # Delete the file only if it was successfully consumed
-                self.log.debug(f"Deleting file {self.path}")
-                os.unlink(self.path)
+                self.log.debug(f"Deleting file {self.working_copy}")
                  self.original_path.unlink()
+                self.working_copy.unlink()
  
                  # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                  shadow_file = os.path.join(
@@ -735,7 +734,7 @@ class Consumer(LoggingMixin):
              )[:127],
              content=text,
              mime_type=mime_type,
-            checksum=hashlib.md5(self.original_path.read_bytes()).hexdigest(),
+            checksum=hashlib.md5(self.working_copy.read_bytes()).hexdigest(),
              created=create_date,
              modified=create_date,
              storage_type=storage_type,
diff --git a/src/documents/parsers.py b/src/documents/parsers.py

index 3215d49a614006ae0dcc4f964742c48da20e288c..b823ae9ceb44f7f11c051a0fb8919dfab463e623 100644 (file)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -322,7 +322,9 @@ class DocumentParser(LoggingMixin):
          self.logging_group = logging_group
          self.settings = self.get_settings()
          os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
+        self.tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
+        )
  
          self.archive_path = None
          self.text = None
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py

index 1db90ee5476c1d9d07632104116032bb5a1117bd..24f7ff3388f7544411985f6d932397a846be5b11 100644 (file)
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -928,7 +928,7 @@ class PreConsumeTestCase(TestCase):
      @override_settings(PRE_CONSUME_SCRIPT=None)
      def test_no_pre_consume_script(self, m):
          c = Consumer()
-        c.path = "path-to-file"
+        c.working_copy = "path-to-file"
          c.run_pre_consume_script()
          m.assert_not_called()
  
@@ -938,7 +938,7 @@ class PreConsumeTestCase(TestCase):
      def test_pre_consume_script_not_found(self, m, m2):
          c = Consumer()
          c.filename = "somefile.pdf"
-        c.path = "path-to-file"
+        c.working_copy = "path-to-file"
          self.assertRaises(ConsumerError, c.run_pre_consume_script)
  
      @mock.patch("documents.consumer.run")
@@ -947,7 +947,7 @@ class PreConsumeTestCase(TestCase):
              with override_settings(PRE_CONSUME_SCRIPT=script.name):
                  c = Consumer()
                  c.original_path = "path-to-file"
-                c.path = "/tmp/somewhere/path-to-file"
+                c.working_copy = "/tmp/somewhere/path-to-file"
                  c.task_id = str(uuid.uuid4())
                  c.run_pre_consume_script()
  
@@ -963,7 +963,7 @@ class PreConsumeTestCase(TestCase):
  
                  subset = {
                      "DOCUMENT_SOURCE_PATH": c.original_path,
-                    "DOCUMENT_WORKING_PATH": c.path,
+                    "DOCUMENT_WORKING_PATH": c.working_copy,
                      "TASK_ID": c.task_id,
                  }
                  self.assertDictEqual(environment, {**environment, **subset})
@@ -991,7 +991,7 @@ class PreConsumeTestCase(TestCase):
              with override_settings(PRE_CONSUME_SCRIPT=script.name):
                  with self.assertLogs("paperless.consumer", level="INFO") as cm:
                      c = Consumer()
-                    c.path = "path-to-file"
+                    c.working_copy = "path-to-file"
  
                      c.run_pre_consume_script()
                      self.assertIn(
@@ -1024,7 +1024,7 @@ class PreConsumeTestCase(TestCase):
  
              with override_settings(PRE_CONSUME_SCRIPT=script.name):
                  c = Consumer()
-                c.path = "path-to-file"
+                c.working_copy = "path-to-file"
                  self.assertRaises(
                      ConsumerError,
                      c.run_pre_consume_script,
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index 047a171b2f2456c2194f357c5068c4862b573873..c699d8ea56d09ed2dcdeb20afefb69450012b39b 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -90,16 +90,18 @@ class RasterisedDocumentParser(DocumentParser):
          with Image.open(image) as im:
              return im.mode in ("RGBA", "LA")
  
-    def remove_alpha(self, image_path: str):
+    def remove_alpha(self, image_path: str) -> Path:
+        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
          subprocess.run(
              [
                  settings.CONVERT_BINARY,
                  "-alpha",
                  "off",
                  image_path,
-                image_path,
+                no_alpha_image,
              ],
          )
+        return no_alpha_image
  
      def get_dpi(self, image) -> Optional[int]:
          try:
@@ -251,7 +253,8 @@ class RasterisedDocumentParser(DocumentParser):
                      f"Removing alpha layer from {input_file} "
                      "for compatibility with img2pdf",
                  )
-                self.remove_alpha(input_file)
+                # Replace the input file with the non-alpha
+                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
  
              if dpi:
                  self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Sat, 6 Jan 2024 05:01:57 +0000 (21:01 -0800)
committer	GitHub <noreply@github.com>
	Sat, 6 Jan 2024 05:01:57 +0000 (21:01 -0800)
docs/advanced_usage.md		patch \| blob \| blame \| history
src/documents/consumer.py		patch \| blob \| blame \| history
src/documents/parsers.py		patch \| blob \| blame \| history
src/documents/tests/test_consumer.py		patch \| blob \| blame \| history
src/paperless_tesseract/parsers.py		patch \| blob \| blame \| history