Changes the consumer to work on a temporary copy and provides that copy to the pre...

author Trenton H <797416+stumpylog@users.noreply.github.com>

Sat, 28 Jan 2023 17:32:40 +0000 (09:32 -0800)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Sun, 29 Jan 2023 16:37:32 +0000 (08:37 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Sat, 28 Jan 2023 17:32:40 +0000 (09:32 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Sun, 29 Jan 2023 16:37:32 +0000 (08:37 -0800)
diff --git a/src/documents/consumer.py b/src/documents/consumer.py

index bc344abb9d7f8a593f74682570a439e82aec5497..8c80304d3c857db717975a7f7990ddf70a1c939d 100644 (file)
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,7 +1,10 @@
  import datetime
  import hashlib
  import os
+import shutil
+import tempfile
  import uuid
+from pathlib import Path
  from subprocess import CompletedProcess
  from subprocess import run
  from typing import Optional
@@ -94,7 +97,8 @@ class Consumer(LoggingMixin):
  
      def __init__(self):
          super().__init__()
-        self.path = None
+        self.path: Optional[Path] = None
+        self.original_path: Optional[Path] = None
          self.filename = None
          self.override_title = None
          self.override_correspondent_id = None
@@ -167,16 +171,18 @@ class Consumer(LoggingMixin):
  
          self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
  
-        filepath_arg = os.path.normpath(self.path)
+        working_file_path = str(self.path)
+        original_file_path = str(self.original_path)
  
          script_env = os.environ.copy()
-        script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
+        script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
+        script_env["DOCUMENT_WORKING_PATH"] = working_file_path
  
          try:
              completed_proc = run(
                  args=[
                      settings.PRE_CONSUME_SCRIPT,
-                    filepath_arg,
+                    original_file_path,
                  ],
                  env=script_env,
                  capture_output=True,
@@ -195,7 +201,7 @@ class Consumer(LoggingMixin):
                  exception=e,
              )
  
-    def run_post_consume_script(self, document):
+    def run_post_consume_script(self, document: Document):
          if not settings.POST_CONSUME_SCRIPT:
              return
  
@@ -285,8 +291,8 @@ class Consumer(LoggingMixin):
          Return the document object if it was successfully created.
          """
  
-        self.path = path
-        self.filename = override_filename or os.path.basename(path)
+        self.path = Path(path).resolve()
+        self.filename = override_filename or self.path.name
          self.override_title = override_title
          self.override_correspondent_id = override_correspondent_id
          self.override_document_type_id = override_document_type_id
@@ -311,6 +317,15 @@ class Consumer(LoggingMixin):
  
          self.log("info", f"Consuming {self.filename}")
  
+        # For the actual work, copy the file into a tempdir
+        self.original_path = self.path
+        tempdir = tempfile.TemporaryDirectory(
+            prefix="paperless-ngx",
+            dir=settings.SCRATCH_DIR,
+        )
+        self.path = Path(tempdir.name) / Path(self.filename)
+        shutil.copy(self.original_path, self.path)
+
          # Determine the parser class.
  
          mime_type = magic.from_file(self.path, mime=True)
@@ -453,11 +468,12 @@ class Consumer(LoggingMixin):
                  # Delete the file only if it was successfully consumed
                  self.log("debug", f"Deleting file {self.path}")
                  os.unlink(self.path)
+                self.original_path.unlink()
  
                  # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                  shadow_file = os.path.join(
-                    os.path.dirname(self.path),
-                    "._" + os.path.basename(self.path),
+                    os.path.dirname(self.original_path),
+                    "._" + os.path.basename(self.original_path),
                  )
  
                  if os.path.isfile(shadow_file):
@@ -474,6 +490,7 @@ class Consumer(LoggingMixin):
              )
          finally:
              document_parser.cleanup()
+            tempdir.cleanup()
  
          self.run_post_consume_script(document)
  
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py

index dc86de3316e38638a18420f664ffd1500fa8ede2..de368018f66d6e81b6d4230c07e00afdf7f4191e 100644 (file)
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
          with tempfile.NamedTemporaryFile() as script:
              with override_settings(PRE_CONSUME_SCRIPT=script.name):
                  c = Consumer()
-                c.path = "path-to-file"
+                c.original_path = "path-to-file"
+                c.path = "/tmp/somewhere/path-to-file"
                  c.run_pre_consume_script()
  
                  m.assert_called_once()
@@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
                  args, kwargs = m.call_args
  
                  command = kwargs["args"]
+                environment = kwargs["env"]
  
                  self.assertEqual(command[0], script.name)
                  self.assertEqual(command[1], "path-to-file")
  
+                self.assertDictContainsSubset(
+                    {
+                        "DOCUMENT_SOURCE_PATH": c.original_path,
+                        "DOCUMENT_WORKING_PATH": c.path,
+                    },
+                    environment,
+                )
+
      @mock.patch("documents.consumer.Consumer.log")
      def test_script_with_output(self, mocked_log):
          """
@@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):
  
                  m.assert_called_once()
  
-                args, kwargs = m.call_args
+                _, kwargs = m.call_args
  
                  command = kwargs["args"]
+                environment = kwargs["env"]
  
                  self.assertEqual(command[0], script.name)
                  self.assertEqual(command[1], str(doc.pk))
@@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
                  self.assertEqual(command[7], "my_bank")
                  self.assertCountEqual(command[8].split(","), ["a", "b"])
  
+                self.assertDictContainsSubset(
+                    {
+                        "DOCUMENT_ID": str(doc.pk),
+                        "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
+                        "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
+                        "DOCUMENT_CORRESPONDENT": "my_bank",
+                        "DOCUMENT_TAGS": "a,b",
+                    },
+                    environment,
+                )
+
      def test_script_exit_non_zero(self):
          """
          GIVEN:
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Sat, 28 Jan 2023 17:32:40 +0000 (09:32 -0800)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Sun, 29 Jan 2023 16:37:32 +0000 (08:37 -0800)
src/documents/consumer.py		patch \| blob \| blame \| history
src/documents/tests/test_consumer.py		patch \| blob \| blame \| history