]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Fix: Allows pre-consume scripts to modify the working path again (#5260)
authorTrenton H <797416+stumpylog@users.noreply.github.com>
Sat, 6 Jan 2024 05:01:57 +0000 (21:01 -0800)
committerGitHub <noreply@github.com>
Sat, 6 Jan 2024 05:01:57 +0000 (21:01 -0800)
* Allows pre-consume scripts to modify the working path again and generally cleans up some confusion about working copy vs original

docs/advanced_usage.md
src/documents/consumer.py
src/documents/parsers.py
src/documents/tests/test_consumer.py
src/paperless_tesseract/parsers.py

index ca5ed4321886f103646c3ff58f8ca0bbe35214de..4bbd7753deab007371addc21aaba46d4510ce6c5 100644 (file)
@@ -136,6 +136,11 @@ script can access the following relevant environment variables set:
     be triggered, leading to failures as two tasks work on the
     same document path
 
+!!! warning
+
+    If your script modifies `DOCUMENT_WORKING_PATH` in a non-deterministic
+    way, this may allow duplicate documents to be stored
+
 A simple but common example for this would be creating a simple script
 like this:
 
index 11faeea433c49bab08d4274160dbcd65228ad811..b7a559575948d5d7b971088163ab4238227f92b8 100644 (file)
@@ -135,24 +135,24 @@ class Consumer(LoggingMixin):
         """
         Confirm the input file still exists where it should
         """
-        if not os.path.isfile(self.path):
+        if not os.path.isfile(self.original_path):
             self._fail(
                 ConsumerStatusShortMessage.FILE_NOT_FOUND,
-                f"Cannot consume {self.path}: File not found.",
+                f"Cannot consume {self.original_path}: File not found.",
             )
 
     def pre_check_duplicate(self):
         """
         Using the MD5 of the file, check this exact file doesn't already exist
         """
-        with open(self.path, "rb") as f:
+        with open(self.original_path, "rb") as f:
             checksum = hashlib.md5(f.read()).hexdigest()
         existing_doc = Document.objects.filter(
             Q(checksum=checksum) | Q(archive_checksum=checksum),
         )
         if existing_doc.exists():
             if settings.CONSUMER_DELETE_DUPLICATES:
-                os.unlink(self.path)
+                os.unlink(self.original_path)
             self._fail(
                 ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS,
                 f"Not consuming {self.filename}: It is a duplicate of"
@@ -211,7 +211,7 @@ class Consumer(LoggingMixin):
 
         self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
 
-        working_file_path = str(self.path)
+        working_file_path = str(self.working_copy)
         original_file_path = str(self.original_path)
 
         script_env = os.environ.copy()
@@ -343,8 +343,8 @@ class Consumer(LoggingMixin):
         Return the document object if it was successfully created.
         """
 
-        self.path = Path(path).resolve()
-        self.filename = override_filename or self.path.name
+        self.original_path = Path(path).resolve()
+        self.filename = override_filename or self.original_path.name
         self.override_title = override_title
         self.override_correspondent_id = override_correspondent_id
         self.override_document_type_id = override_document_type_id
@@ -377,17 +377,16 @@ class Consumer(LoggingMixin):
         self.log.info(f"Consuming {self.filename}")
 
         # For the actual work, copy the file into a tempdir
-        self.original_path = self.path
         tempdir = tempfile.TemporaryDirectory(
             prefix="paperless-ngx",
             dir=settings.SCRATCH_DIR,
         )
-        self.path = Path(tempdir.name) / Path(self.filename)
-        copy_file_with_basic_stats(self.original_path, self.path)
+        self.working_copy = Path(tempdir.name) / Path(self.filename)
+        copy_file_with_basic_stats(self.original_path, self.working_copy)
 
         # Determine the parser class.
 
-        mime_type = magic.from_file(self.path, mime=True)
+        mime_type = magic.from_file(self.working_copy, mime=True)
 
         self.log.debug(f"Detected mime type: {mime_type}")
 
@@ -406,7 +405,7 @@ class Consumer(LoggingMixin):
 
         document_consumption_started.send(
             sender=self.__class__,
-            filename=self.path,
+            filename=self.working_copy,
             logging_group=self.logging_group,
         )
 
@@ -444,7 +443,7 @@ class Consumer(LoggingMixin):
                 ConsumerStatusShortMessage.PARSING_DOCUMENT,
             )
             self.log.debug(f"Parsing {self.filename}...")
-            document_parser.parse(self.path, mime_type, self.filename)
+            document_parser.parse(self.working_copy, mime_type, self.filename)
 
             self.log.debug(f"Generating thumbnail for {self.filename}...")
             self._send_progress(
@@ -454,7 +453,7 @@ class Consumer(LoggingMixin):
                 ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
             )
             thumbnail = document_parser.get_thumbnail(
-                self.path,
+                self.working_copy,
                 mime_type,
                 self.filename,
             )
@@ -527,7 +526,7 @@ class Consumer(LoggingMixin):
 
                     self._write(
                         document.storage_type,
-                        self.original_path,
+                        self.working_copy,
                         document.source_path,
                     )
 
@@ -560,9 +559,9 @@ class Consumer(LoggingMixin):
                 document.save()
 
                 # Delete the file only if it was successfully consumed
-                self.log.debug(f"Deleting file {self.path}")
-                os.unlink(self.path)
+                self.log.debug(f"Deleting file {self.working_copy}")
                 self.original_path.unlink()
+                self.working_copy.unlink()
 
                 # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                 shadow_file = os.path.join(
@@ -735,7 +734,7 @@ class Consumer(LoggingMixin):
             )[:127],
             content=text,
             mime_type=mime_type,
-            checksum=hashlib.md5(self.original_path.read_bytes()).hexdigest(),
+            checksum=hashlib.md5(self.working_copy.read_bytes()).hexdigest(),
             created=create_date,
             modified=create_date,
             storage_type=storage_type,
index 3215d49a614006ae0dcc4f964742c48da20e288c..b823ae9ceb44f7f11c051a0fb8919dfab463e623 100644 (file)
@@ -322,7 +322,9 @@ class DocumentParser(LoggingMixin):
         self.logging_group = logging_group
         self.settings = self.get_settings()
         os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
+        self.tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
+        )
 
         self.archive_path = None
         self.text = None
index 1db90ee5476c1d9d07632104116032bb5a1117bd..24f7ff3388f7544411985f6d932397a846be5b11 100644 (file)
@@ -928,7 +928,7 @@ class PreConsumeTestCase(TestCase):
     @override_settings(PRE_CONSUME_SCRIPT=None)
     def test_no_pre_consume_script(self, m):
         c = Consumer()
-        c.path = "path-to-file"
+        c.working_copy = "path-to-file"
         c.run_pre_consume_script()
         m.assert_not_called()
 
@@ -938,7 +938,7 @@ class PreConsumeTestCase(TestCase):
     def test_pre_consume_script_not_found(self, m, m2):
         c = Consumer()
         c.filename = "somefile.pdf"
-        c.path = "path-to-file"
+        c.working_copy = "path-to-file"
         self.assertRaises(ConsumerError, c.run_pre_consume_script)
 
     @mock.patch("documents.consumer.run")
@@ -947,7 +947,7 @@ class PreConsumeTestCase(TestCase):
             with override_settings(PRE_CONSUME_SCRIPT=script.name):
                 c = Consumer()
                 c.original_path = "path-to-file"
-                c.path = "/tmp/somewhere/path-to-file"
+                c.working_copy = "/tmp/somewhere/path-to-file"
                 c.task_id = str(uuid.uuid4())
                 c.run_pre_consume_script()
 
@@ -963,7 +963,7 @@ class PreConsumeTestCase(TestCase):
 
                 subset = {
                     "DOCUMENT_SOURCE_PATH": c.original_path,
-                    "DOCUMENT_WORKING_PATH": c.path,
+                    "DOCUMENT_WORKING_PATH": c.working_copy,
                     "TASK_ID": c.task_id,
                 }
                 self.assertDictEqual(environment, {**environment, **subset})
@@ -991,7 +991,7 @@ class PreConsumeTestCase(TestCase):
             with override_settings(PRE_CONSUME_SCRIPT=script.name):
                 with self.assertLogs("paperless.consumer", level="INFO") as cm:
                     c = Consumer()
-                    c.path = "path-to-file"
+                    c.working_copy = "path-to-file"
 
                     c.run_pre_consume_script()
                     self.assertIn(
@@ -1024,7 +1024,7 @@ class PreConsumeTestCase(TestCase):
 
             with override_settings(PRE_CONSUME_SCRIPT=script.name):
                 c = Consumer()
-                c.path = "path-to-file"
+                c.working_copy = "path-to-file"
                 self.assertRaises(
                     ConsumerError,
                     c.run_pre_consume_script,
index 047a171b2f2456c2194f357c5068c4862b573873..c699d8ea56d09ed2dcdeb20afefb69450012b39b 100644 (file)
@@ -90,16 +90,18 @@ class RasterisedDocumentParser(DocumentParser):
         with Image.open(image) as im:
             return im.mode in ("RGBA", "LA")
 
-    def remove_alpha(self, image_path: str):
+    def remove_alpha(self, image_path: str) -> Path:
+        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
         subprocess.run(
             [
                 settings.CONVERT_BINARY,
                 "-alpha",
                 "off",
                 image_path,
-                image_path,
+                no_alpha_image,
             ],
         )
+        return no_alpha_image
 
     def get_dpi(self, image) -> Optional[int]:
         try:
@@ -251,7 +253,8 @@ class RasterisedDocumentParser(DocumentParser):
                     f"Removing alpha layer from {input_file} "
                     "for compatibility with img2pdf",
                 )
-                self.remove_alpha(input_file)
+                # Replace the input file with the non-alpha
+                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
 
             if dpi:
                 self.log.debug(f"Detected DPI for image {input_file}: {dpi}")