]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Chore: Standardize subprocess running and logging (#6275)
authorTrenton H <797416+stumpylog@users.noreply.github.com>
Thu, 4 Apr 2024 20:11:43 +0000 (13:11 -0700)
committerGitHub <noreply@github.com>
Thu, 4 Apr 2024 20:11:43 +0000 (13:11 -0700)
src/documents/consumer.py
src/documents/converters.py
src/documents/parsers.py
src/documents/tests/test_consumer.py
src/documents/utils.py
src/paperless_tesseract/parsers.py

index 2a0e5dce21fc19dd2f5b08dd75128baf90896d77..c735ed4c814b2fca4e617e1c33cb1ae0724d64e2 100644 (file)
@@ -5,8 +5,6 @@ import tempfile
 import uuid
 from enum import Enum
 from pathlib import Path
-from subprocess import CompletedProcess
-from subprocess import run
 from typing import TYPE_CHECKING
 from typing import Optional
 
@@ -51,6 +49,7 @@ from documents.signals import document_consumption_finished
 from documents.signals import document_consumption_started
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
+from documents.utils import run_subprocess
 
 
 class WorkflowTriggerPlugin(
@@ -397,20 +396,15 @@ class Consumer(LoggingMixin):
         script_env["TASK_ID"] = self.task_id or ""
 
         try:
-            completed_proc = run(
-                args=[
+            run_subprocess(
+                [
                     settings.PRE_CONSUME_SCRIPT,
                     original_file_path,
                 ],
-                env=script_env,
-                capture_output=True,
+                script_env,
+                self.log,
             )
 
-            self._log_script_outputs(completed_proc)
-
-            # Raises exception on non-zero output
-            completed_proc.check_returncode()
-
         except Exception as e:
             self._fail(
                 ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
@@ -468,8 +462,8 @@ class Consumer(LoggingMixin):
         script_env["TASK_ID"] = self.task_id or ""
 
         try:
-            completed_proc = run(
-                args=[
+            run_subprocess(
+                [
                     settings.POST_CONSUME_SCRIPT,
                     str(document.pk),
                     document.get_public_filename(),
@@ -480,15 +474,10 @@ class Consumer(LoggingMixin):
                     str(document.correspondent),
                     str(",".join(document.tags.all().values_list("name", flat=True))),
                 ],
-                env=script_env,
-                capture_output=True,
+                script_env,
+                self.log,
             )
 
-            self._log_script_outputs(completed_proc)
-
-            # Raises exception on non-zero output
-            completed_proc.check_returncode()
-
         except Exception as e:
             self._fail(
                 ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
@@ -929,41 +918,6 @@ class Consumer(LoggingMixin):
         except Exception:  # pragma: no cover
             pass
 
-    def _log_script_outputs(self, completed_process: CompletedProcess):
-        """
-        Decodes a process stdout and stderr streams and logs them to the main log
-        """
-        # Log what the script exited as
-        self.log.info(
-            f"{completed_process.args[0]} exited {completed_process.returncode}",
-        )
-
-        # Decode the output (if any)
-        if len(completed_process.stdout):
-            stdout_str = (
-                completed_process.stdout.decode("utf8", errors="ignore")
-                .strip()
-                .split(
-                    "\n",
-                )
-            )
-            self.log.info("Script stdout:")
-            for line in stdout_str:
-                self.log.info(line)
-
-        if len(completed_process.stderr):
-            stderr_str = (
-                completed_process.stderr.decode("utf8", errors="ignore")
-                .strip()
-                .split(
-                    "\n",
-                )
-            )
-
-            self.log.warning("Script stderr:")
-            for line in stderr_str:
-                self.log.warning(line)
-
 
 def parse_doc_title_w_placeholders(
     title: str,
index 5c5ba1e078b10d60055eef37a2cf5bf9e67af775..fcd0a8857145046c457b9642a18e5b7424987f59 100644 (file)
@@ -1,5 +1,4 @@
 from pathlib import Path
-from subprocess import run
 
 import img2pdf
 from django.conf import settings
@@ -7,6 +6,7 @@ from PIL import Image
 
 from documents.utils import copy_basic_file_stats
 from documents.utils import maybe_override_pixel_limit
+from documents.utils import run_subprocess
 
 
 def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
@@ -27,7 +27,7 @@ def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
         # Note the save into the temp folder, so as not to trigger a new
         # consume
         scratch_image = target_directory / tiff_path.name
-        run(
+        run_subprocess(
             [
                 settings.CONVERT_BINARY,
                 "-alpha",
index a2b4677e692ab43f8c8f4f141e2c1e34a8a7aec9..d781ddb9f2c5a0757512623feab1ca0ea05e1075 100644 (file)
@@ -18,6 +18,7 @@ from django.utils import timezone
 from documents.loggers import LoggingMixin
 from documents.signals import document_consumer_declaration
 from documents.utils import copy_file_with_basic_stats
+from documents.utils import run_subprocess
 
 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
@@ -164,8 +165,12 @@ def run_convert(
 
     logger.debug("Execute: " + " ".join(args), extra={"group": logging_group})
 
-    if not subprocess.Popen(args, env=environment).wait() == 0:
-        raise ParseError(f"Convert failed at {args}")
+    try:
+        run_subprocess(args, environment, logger)
+    except subprocess.CalledProcessError as e:
+        raise ParseError(f"Convert failed at {args}") from e
+    except Exception as e:  # pragma: no cover
+        raise ParseError("Unknown error running convert") from e
 
 
 def get_default_thumbnail() -> Path:
@@ -188,9 +193,12 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
     # Ghostscript doesn't handle WebP outputs
     gs_out_path = os.path.join(temp_dir, "gs_out.png")
     cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path]
+
     try:
-        if not subprocess.Popen(cmd).wait() == 0:
-            raise ParseError(f"Thumbnail (gs) failed at {cmd}")
+        try:
+            run_subprocess(cmd, logger=logger)
+        except subprocess.CalledProcessError as e:
+            raise ParseError(f"Thumbnail (gs) failed at {cmd}") from e
         # then run convert on the output from gs to make WebP
         run_convert(
             density=300,
index f7726597075869116d036f4047086b78669736bf..6c23df8aab737467de10dd7b40a3dbdbf27a58ec 100644 (file)
@@ -934,7 +934,7 @@ class PreConsumeTestCase(TestCase):
 
         return super().setUp()
 
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
     @override_settings(PRE_CONSUME_SCRIPT=None)
     def test_no_pre_consume_script(self, m):
         c = Consumer()
@@ -942,7 +942,7 @@ class PreConsumeTestCase(TestCase):
         c.run_pre_consume_script()
         m.assert_not_called()
 
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
     @mock.patch("documents.consumer.Consumer._send_progress")
     @override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
     def test_pre_consume_script_not_found(self, m, m2):
@@ -951,7 +951,7 @@ class PreConsumeTestCase(TestCase):
         c.working_copy = "path-to-file"
         self.assertRaises(ConsumerError, c.run_pre_consume_script)
 
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
     def test_pre_consume_script(self, m):
         with tempfile.NamedTemporaryFile() as script:
             with override_settings(PRE_CONSUME_SCRIPT=script.name):
@@ -963,10 +963,10 @@ class PreConsumeTestCase(TestCase):
 
                 m.assert_called_once()
 
-                args, kwargs = m.call_args
+                args, _ = m.call_args
 
-                command = kwargs["args"]
-                environment = kwargs["env"]
+                command = args[0]
+                environment = args[1]
 
                 self.assertEqual(command[0], script.name)
                 self.assertEqual(command[1], "path-to-file")
@@ -1050,7 +1050,7 @@ class PostConsumeTestCase(TestCase):
 
         return super().setUp()
 
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
     @override_settings(POST_CONSUME_SCRIPT=None)
     def test_no_post_consume_script(self, m):
         doc = Document.objects.create(title="Test", mime_type="application/pdf")
@@ -1075,7 +1075,7 @@ class PostConsumeTestCase(TestCase):
             doc,
         )
 
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
     def test_post_consume_script_simple(self, m):
         with tempfile.NamedTemporaryFile() as script:
             with override_settings(POST_CONSUME_SCRIPT=script.name):
@@ -1085,7 +1085,7 @@ class PostConsumeTestCase(TestCase):
 
                 m.assert_called_once()
 
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
     def test_post_consume_script_with_correspondent(self, m):
         with tempfile.NamedTemporaryFile() as script:
             with override_settings(POST_CONSUME_SCRIPT=script.name):
@@ -1106,10 +1106,10 @@ class PostConsumeTestCase(TestCase):
 
                 m.assert_called_once()
 
-                _, kwargs = m.call_args
+                args, _ = m.call_args
 
-                command = kwargs["args"]
-                environment = kwargs["env"]
+                command = args[0]
+                environment = args[1]
 
                 self.assertEqual(command[0], script.name)
                 self.assertEqual(command[1], str(doc.pk))
index 29f4de14df3185a908e591e2b10084ffe2ccab9d..0af1f54e374d32dd25048253fac32ee4f8739cfc 100644 (file)
@@ -1,6 +1,9 @@
+import logging
 import shutil
 from os import utime
 from pathlib import Path
+from subprocess import CompletedProcess
+from subprocess import run
 from typing import Optional
 from typing import Union
 
@@ -56,3 +59,54 @@ def maybe_override_pixel_limit() -> None:
         if pixel_count == 0:
             pixel_count = None
         Image.MAX_IMAGE_PIXELS = pixel_count
+
+
+def run_subprocess(
+    arguments: list[str],
+    env: Optional[dict[str, str]] = None,
+    logger: Optional[logging.Logger] = None,
+    *,
+    check_exit_code: bool = True,
+    log_stdout: bool = True,
+    log_stderr: bool = True,
+) -> CompletedProcess:
+    """
+    Runs a subprocess and logs its output, checking return code if requested
+    """
+
+    proc_name = arguments[0]
+
+    completed_proc = run(args=arguments, env=env, capture_output=True, check=False)
+
+    if logger:
+        logger.info(f"{proc_name} exited {completed_proc.returncode}")
+
+    if log_stdout and logger and completed_proc.stdout:
+        stdout_str = (
+            completed_proc.stdout.decode("utf8", errors="ignore")
+            .strip()
+            .split(
+                "\n",
+            )
+        )
+        logger.info(f"{proc_name} stdout:")
+        for line in stdout_str:
+            logger.info(line)
+
+    if log_stderr and logger and completed_proc.stderr:
+        stderr_str = (
+            completed_proc.stderr.decode("utf8", errors="ignore")
+            .strip()
+            .split(
+                "\n",
+            )
+        )
+        logger.info(f"{proc_name} stderr:")
+        for line in stderr_str:
+            logger.warning(line)
+
+    # Last, if requested, after logging outputs
+    if check_exit_code:
+        completed_proc.check_returncode()
+
+    return completed_proc
index c483a3da4d637516f63a3cd5240ae096a6a40f41..44d421edc71f4797d2715e832b67870f139e2068 100644 (file)
@@ -1,6 +1,5 @@
 import os
 import re
-import subprocess
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -13,6 +12,7 @@ from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from documents.utils import maybe_override_pixel_limit
+from documents.utils import run_subprocess
 from paperless.config import OcrConfig
 from paperless.models import ArchiveFileChoices
 from paperless.models import CleanChoices
@@ -103,7 +103,7 @@ class RasterisedDocumentParser(DocumentParser):
 
     def remove_alpha(self, image_path: str) -> Path:
         no_alpha_image = Path(self.tempdir) / "image-no-alpha"
-        subprocess.run(
+        run_subprocess(
             [
                 settings.CONVERT_BINARY,
                 "-alpha",
@@ -111,6 +111,7 @@ class RasterisedDocumentParser(DocumentParser):
                 image_path,
                 no_alpha_image,
             ],
+            logger=self.log,
         )
         return no_alpha_image
 
@@ -169,7 +170,7 @@ class RasterisedDocumentParser(DocumentParser):
                 mode="w+",
                 dir=self.tempdir,
             ) as tmp:
-                subprocess.run(
+                run_subprocess(
                     [
                         "pdftotext",
                         "-q",
@@ -179,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
                         pdf_file,
                         tmp.name,
                     ],
+                    logger=self.log,
                 )
                 text = self.read_file_handle_unicode_errors(Path(tmp.name))