]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Changes the consumer to work on a temporary copy and provides that copy to the pre...
authorTrenton H <797416+stumpylog@users.noreply.github.com>
Sat, 28 Jan 2023 17:32:40 +0000 (09:32 -0800)
committerTrenton H <797416+stumpylog@users.noreply.github.com>
Sun, 29 Jan 2023 16:37:32 +0000 (08:37 -0800)
src/documents/consumer.py
src/documents/tests/test_consumer.py

index bc344abb9d7f8a593f74682570a439e82aec5497..8c80304d3c857db717975a7f7990ddf70a1c939d 100644 (file)
@@ -1,7 +1,10 @@
 import datetime
 import hashlib
 import os
+import shutil
+import tempfile
 import uuid
+from pathlib import Path
 from subprocess import CompletedProcess
 from subprocess import run
 from typing import Optional
@@ -94,7 +97,8 @@ class Consumer(LoggingMixin):
 
     def __init__(self):
         super().__init__()
-        self.path = None
+        self.path: Optional[Path] = None
+        self.original_path: Optional[Path] = None
         self.filename = None
         self.override_title = None
         self.override_correspondent_id = None
@@ -167,16 +171,18 @@ class Consumer(LoggingMixin):
 
         self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
 
-        filepath_arg = os.path.normpath(self.path)
+        working_file_path = str(self.path)
+        original_file_path = str(self.original_path)
 
         script_env = os.environ.copy()
-        script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
+        script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
+        script_env["DOCUMENT_WORKING_PATH"] = working_file_path
 
         try:
             completed_proc = run(
                 args=[
                     settings.PRE_CONSUME_SCRIPT,
-                    filepath_arg,
+                    original_file_path,
                 ],
                 env=script_env,
                 capture_output=True,
@@ -195,7 +201,7 @@ class Consumer(LoggingMixin):
                 exception=e,
             )
 
-    def run_post_consume_script(self, document):
+    def run_post_consume_script(self, document: Document):
         if not settings.POST_CONSUME_SCRIPT:
             return
 
@@ -285,8 +291,8 @@ class Consumer(LoggingMixin):
         Return the document object if it was successfully created.
         """
 
-        self.path = path
-        self.filename = override_filename or os.path.basename(path)
+        self.path = Path(path).resolve()
+        self.filename = override_filename or self.path.name
         self.override_title = override_title
         self.override_correspondent_id = override_correspondent_id
         self.override_document_type_id = override_document_type_id
@@ -311,6 +317,15 @@ class Consumer(LoggingMixin):
 
         self.log("info", f"Consuming {self.filename}")
 
+        # For the actual work, copy the file into a tempdir
+        self.original_path = self.path
+        tempdir = tempfile.TemporaryDirectory(
+            prefix="paperless-ngx",
+            dir=settings.SCRATCH_DIR,
+        )
+        self.path = Path(tempdir.name) / Path(self.filename)
+        shutil.copy(self.original_path, self.path)
+
         # Determine the parser class.
 
         mime_type = magic.from_file(self.path, mime=True)
@@ -453,11 +468,12 @@ class Consumer(LoggingMixin):
                 # Delete the file only if it was successfully consumed
                 self.log("debug", f"Deleting file {self.path}")
                 os.unlink(self.path)
+                self.original_path.unlink()
 
                 # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                 shadow_file = os.path.join(
-                    os.path.dirname(self.path),
-                    "._" + os.path.basename(self.path),
+                    os.path.dirname(self.original_path),
+                    "._" + os.path.basename(self.original_path),
                 )
 
                 if os.path.isfile(shadow_file):
@@ -474,6 +490,7 @@ class Consumer(LoggingMixin):
             )
         finally:
             document_parser.cleanup()
+            tempdir.cleanup()
 
         self.run_post_consume_script(document)
 
index dc86de3316e38638a18420f664ffd1500fa8ede2..de368018f66d6e81b6d4230c07e00afdf7f4191e 100644 (file)
@@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
         with tempfile.NamedTemporaryFile() as script:
             with override_settings(PRE_CONSUME_SCRIPT=script.name):
                 c = Consumer()
-                c.path = "path-to-file"
+                c.original_path = "path-to-file"
+                c.path = "/tmp/somewhere/path-to-file"
                 c.run_pre_consume_script()
 
                 m.assert_called_once()
@@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
                 args, kwargs = m.call_args
 
                 command = kwargs["args"]
+                environment = kwargs["env"]
 
                 self.assertEqual(command[0], script.name)
                 self.assertEqual(command[1], "path-to-file")
 
+                self.assertDictContainsSubset(
+                    {
+                        "DOCUMENT_SOURCE_PATH": c.original_path,
+                        "DOCUMENT_WORKING_PATH": c.path,
+                    },
+                    environment,
+                )
+
     @mock.patch("documents.consumer.Consumer.log")
     def test_script_with_output(self, mocked_log):
         """
@@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):
 
                 m.assert_called_once()
 
-                args, kwargs = m.call_args
+                _, kwargs = m.call_args
 
                 command = kwargs["args"]
+                environment = kwargs["env"]
 
                 self.assertEqual(command[0], script.name)
                 self.assertEqual(command[1], str(doc.pk))
@@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
                 self.assertEqual(command[7], "my_bank")
                 self.assertCountEqual(command[8].split(","), ["a", "b"])
 
+                self.assertDictContainsSubset(
+                    {
+                        "DOCUMENT_ID": str(doc.pk),
+                        "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
+                        "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
+                        "DOCUMENT_CORRESPONDENT": "my_bank",
+                        "DOCUMENT_TAGS": "a,b",
+                    },
+                    environment,
+                )
+
     def test_script_exit_non_zero(self):
         """
         GIVEN: