Chore: Standardize subprocess running and logging (#6275)

author Trenton H <797416+stumpylog@users.noreply.github.com>

Thu, 4 Apr 2024 20:11:43 +0000 (13:11 -0700)

committer GitHub <noreply@github.com>

Thu, 4 Apr 2024 20:11:43 +0000 (13:11 -0700)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Thu, 4 Apr 2024 20:11:43 +0000 (13:11 -0700)
committer GitHub <noreply@github.com>
Thu, 4 Apr 2024 20:11:43 +0000 (13:11 -0700)
diff --git a/src/documents/consumer.py b/src/documents/consumer.py

index 2a0e5dce21fc19dd2f5b08dd75128baf90896d77..c735ed4c814b2fca4e617e1c33cb1ae0724d64e2 100644 (file)
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -5,8 +5,6 @@ import tempfile
  import uuid
  from enum import Enum
  from pathlib import Path
-from subprocess import CompletedProcess
-from subprocess import run
  from typing import TYPE_CHECKING
  from typing import Optional
  
@@ -51,6 +49,7 @@ from documents.signals import document_consumption_finished
  from documents.signals import document_consumption_started
  from documents.utils import copy_basic_file_stats
  from documents.utils import copy_file_with_basic_stats
+from documents.utils import run_subprocess
  
  
  class WorkflowTriggerPlugin(
@@ -397,20 +396,15 @@ class Consumer(LoggingMixin):
          script_env["TASK_ID"] = self.task_id or ""
  
          try:
-            completed_proc = run(
-                args=[
+            run_subprocess(
+                [
                      settings.PRE_CONSUME_SCRIPT,
                      original_file_path,
                  ],
-                env=script_env,
-                capture_output=True,
+                script_env,
+                self.log,
              )
  
-            self._log_script_outputs(completed_proc)
-
-            # Raises exception on non-zero output
-            completed_proc.check_returncode()
-
          except Exception as e:
              self._fail(
                  ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
@@ -468,8 +462,8 @@ class Consumer(LoggingMixin):
          script_env["TASK_ID"] = self.task_id or ""
  
          try:
-            completed_proc = run(
-                args=[
+            run_subprocess(
+                [
                      settings.POST_CONSUME_SCRIPT,
                      str(document.pk),
                      document.get_public_filename(),
@@ -480,15 +474,10 @@ class Consumer(LoggingMixin):
                      str(document.correspondent),
                      str(",".join(document.tags.all().values_list("name", flat=True))),
                  ],
-                env=script_env,
-                capture_output=True,
+                script_env,
+                self.log,
              )
  
-            self._log_script_outputs(completed_proc)
-
-            # Raises exception on non-zero output
-            completed_proc.check_returncode()
-
          except Exception as e:
              self._fail(
                  ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
@@ -929,41 +918,6 @@ class Consumer(LoggingMixin):
          except Exception:  # pragma: no cover
              pass
  
-    def _log_script_outputs(self, completed_process: CompletedProcess):
-        """
-        Decodes a process stdout and stderr streams and logs them to the main log
-        """
-        # Log what the script exited as
-        self.log.info(
-            f"{completed_process.args[0]} exited {completed_process.returncode}",
-        )
-
-        # Decode the output (if any)
-        if len(completed_process.stdout):
-            stdout_str = (
-                completed_process.stdout.decode("utf8", errors="ignore")
-                .strip()
-                .split(
-                    "\n",
-                )
-            )
-            self.log.info("Script stdout:")
-            for line in stdout_str:
-                self.log.info(line)
-
-        if len(completed_process.stderr):
-            stderr_str = (
-                completed_process.stderr.decode("utf8", errors="ignore")
-                .strip()
-                .split(
-                    "\n",
-                )
-            )
-
-            self.log.warning("Script stderr:")
-            for line in stderr_str:
-                self.log.warning(line)
-
  
  def parse_doc_title_w_placeholders(
      title: str,
diff --git a/src/documents/converters.py b/src/documents/converters.py

index 5c5ba1e078b10d60055eef37a2cf5bf9e67af775..fcd0a8857145046c457b9642a18e5b7424987f59 100644 (file)
--- a/src/documents/converters.py
+++ b/src/documents/converters.py
@@ -1,5 +1,4 @@
  from pathlib import Path
-from subprocess import run
  
  import img2pdf
  from django.conf import settings
@@ -7,6 +6,7 @@ from PIL import Image
  
  from documents.utils import copy_basic_file_stats
  from documents.utils import maybe_override_pixel_limit
+from documents.utils import run_subprocess
  
  
  def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
@@ -27,7 +27,7 @@ def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
          # Note the save into the temp folder, so as not to trigger a new
          # consume
          scratch_image = target_directory / tiff_path.name
-        run(
+        run_subprocess(
              [
                  settings.CONVERT_BINARY,
                  "-alpha",
diff --git a/src/documents/parsers.py b/src/documents/parsers.py

index a2b4677e692ab43f8c8f4f141e2c1e34a8a7aec9..d781ddb9f2c5a0757512623feab1ca0ea05e1075 100644 (file)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -18,6 +18,7 @@ from django.utils import timezone
  from documents.loggers import LoggingMixin
  from documents.signals import document_consumer_declaration
  from documents.utils import copy_file_with_basic_stats
+from documents.utils import run_subprocess
  
  # This regular expression will try to find dates in the document at
  # hand and will match the following formats:
@@ -164,8 +165,12 @@ def run_convert(
  
      logger.debug("Execute: " + " ".join(args), extra={"group": logging_group})
  
-    if not subprocess.Popen(args, env=environment).wait() == 0:
-        raise ParseError(f"Convert failed at {args}")
+    try:
+        run_subprocess(args, environment, logger)
+    except subprocess.CalledProcessError as e:
+        raise ParseError(f"Convert failed at {args}") from e
+    except Exception as e:  # pragma: no cover
+        raise ParseError("Unknown error running convert") from e
  
  
  def get_default_thumbnail() -> Path:
@@ -188,9 +193,12 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
      # Ghostscript doesn't handle WebP outputs
      gs_out_path = os.path.join(temp_dir, "gs_out.png")
      cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path]
+
      try:
-        if not subprocess.Popen(cmd).wait() == 0:
-            raise ParseError(f"Thumbnail (gs) failed at {cmd}")
+        try:
+            run_subprocess(cmd, logger=logger)
+        except subprocess.CalledProcessError as e:
+            raise ParseError(f"Thumbnail (gs) failed at {cmd}") from e
          # then run convert on the output from gs to make WebP
          run_convert(
              density=300,
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py

index f7726597075869116d036f4047086b78669736bf..6c23df8aab737467de10dd7b40a3dbdbf27a58ec 100644 (file)
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -934,7 +934,7 @@ class PreConsumeTestCase(TestCase):
  
          return super().setUp()
  
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
      @override_settings(PRE_CONSUME_SCRIPT=None)
      def test_no_pre_consume_script(self, m):
          c = Consumer()
@@ -942,7 +942,7 @@ class PreConsumeTestCase(TestCase):
          c.run_pre_consume_script()
          m.assert_not_called()
  
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
      @mock.patch("documents.consumer.Consumer._send_progress")
      @override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
      def test_pre_consume_script_not_found(self, m, m2):
@@ -951,7 +951,7 @@ class PreConsumeTestCase(TestCase):
          c.working_copy = "path-to-file"
          self.assertRaises(ConsumerError, c.run_pre_consume_script)
  
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
      def test_pre_consume_script(self, m):
          with tempfile.NamedTemporaryFile() as script:
              with override_settings(PRE_CONSUME_SCRIPT=script.name):
@@ -963,10 +963,10 @@ class PreConsumeTestCase(TestCase):
  
                  m.assert_called_once()
  
-                args, kwargs = m.call_args
+                args, _ = m.call_args
  
-                command = kwargs["args"]
-                environment = kwargs["env"]
+                command = args[0]
+                environment = args[1]
  
                  self.assertEqual(command[0], script.name)
                  self.assertEqual(command[1], "path-to-file")
@@ -1050,7 +1050,7 @@ class PostConsumeTestCase(TestCase):
  
          return super().setUp()
  
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
      @override_settings(POST_CONSUME_SCRIPT=None)
      def test_no_post_consume_script(self, m):
          doc = Document.objects.create(title="Test", mime_type="application/pdf")
@@ -1075,7 +1075,7 @@ class PostConsumeTestCase(TestCase):
              doc,
          )
  
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
      def test_post_consume_script_simple(self, m):
          with tempfile.NamedTemporaryFile() as script:
              with override_settings(POST_CONSUME_SCRIPT=script.name):
@@ -1085,7 +1085,7 @@ class PostConsumeTestCase(TestCase):
  
                  m.assert_called_once()
  
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
      def test_post_consume_script_with_correspondent(self, m):
          with tempfile.NamedTemporaryFile() as script:
              with override_settings(POST_CONSUME_SCRIPT=script.name):
@@ -1106,10 +1106,10 @@ class PostConsumeTestCase(TestCase):
  
                  m.assert_called_once()
  
-                _, kwargs = m.call_args
+                args, _ = m.call_args
  
-                command = kwargs["args"]
-                environment = kwargs["env"]
+                command = args[0]
+                environment = args[1]
  
                  self.assertEqual(command[0], script.name)
                  self.assertEqual(command[1], str(doc.pk))
diff --git a/src/documents/utils.py b/src/documents/utils.py

index 29f4de14df3185a908e591e2b10084ffe2ccab9d..0af1f54e374d32dd25048253fac32ee4f8739cfc 100644 (file)
--- a/src/documents/utils.py
+++ b/src/documents/utils.py
@@ -1,6 +1,9 @@
+import logging
  import shutil
  from os import utime
  from pathlib import Path
+from subprocess import CompletedProcess
+from subprocess import run
  from typing import Optional
  from typing import Union
  
@@ -56,3 +59,54 @@ def maybe_override_pixel_limit() -> None:
          if pixel_count == 0:
              pixel_count = None
          Image.MAX_IMAGE_PIXELS = pixel_count
+
+
+def run_subprocess(
+    arguments: list[str],
+    env: Optional[dict[str, str]] = None,
+    logger: Optional[logging.Logger] = None,
+    *,
+    check_exit_code: bool = True,
+    log_stdout: bool = True,
+    log_stderr: bool = True,
+) -> CompletedProcess:
+    """
+    Runs a subprocess and logs its output, checking return code if requested
+    """
+
+    proc_name = arguments[0]
+
+    completed_proc = run(args=arguments, env=env, capture_output=True, check=False)
+
+    if logger:
+        logger.info(f"{proc_name} exited {completed_proc.returncode}")
+
+    if log_stdout and logger and completed_proc.stdout:
+        stdout_str = (
+            completed_proc.stdout.decode("utf8", errors="ignore")
+            .strip()
+            .split(
+                "\n",
+            )
+        )
+        logger.info(f"{proc_name} stdout:")
+        for line in stdout_str:
+            logger.info(line)
+
+    if log_stderr and logger and completed_proc.stderr:
+        stderr_str = (
+            completed_proc.stderr.decode("utf8", errors="ignore")
+            .strip()
+            .split(
+                "\n",
+            )
+        )
+        logger.info(f"{proc_name} stderr:")
+        for line in stderr_str:
+            logger.warning(line)
+
+    # Last, if requested, after logging outputs
+    if check_exit_code:
+        completed_proc.check_returncode()
+
+    return completed_proc
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index c483a3da4d637516f63a3cd5240ae096a6a40f41..44d421edc71f4797d2715e832b67870f139e2068 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -1,6 +1,5 @@
  import os
  import re
-import subprocess
  import tempfile
  from pathlib import Path
  from typing import TYPE_CHECKING
@@ -13,6 +12,7 @@ from documents.parsers import DocumentParser
  from documents.parsers import ParseError
  from documents.parsers import make_thumbnail_from_pdf
  from documents.utils import maybe_override_pixel_limit
+from documents.utils import run_subprocess
  from paperless.config import OcrConfig
  from paperless.models import ArchiveFileChoices
  from paperless.models import CleanChoices
@@ -103,7 +103,7 @@ class RasterisedDocumentParser(DocumentParser):
  
      def remove_alpha(self, image_path: str) -> Path:
          no_alpha_image = Path(self.tempdir) / "image-no-alpha"
-        subprocess.run(
+        run_subprocess(
              [
                  settings.CONVERT_BINARY,
                  "-alpha",
@@ -111,6 +111,7 @@ class RasterisedDocumentParser(DocumentParser):
                  image_path,
                  no_alpha_image,
              ],
+            logger=self.log,
          )
          return no_alpha_image
  
@@ -169,7 +170,7 @@ class RasterisedDocumentParser(DocumentParser):
                  mode="w+",
                  dir=self.tempdir,
              ) as tmp:
-                subprocess.run(
+                run_subprocess(
                      [
                          "pdftotext",
                          "-q",
@@ -179,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
                          pdf_file,
                          tmp.name,
                      ],
+                    logger=self.log,
                  )
                  text = self.read_file_handle_unicode_errors(Path(tmp.name))
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Thu, 4 Apr 2024 20:11:43 +0000 (13:11 -0700)
committer	GitHub <noreply@github.com>
	Thu, 4 Apr 2024 20:11:43 +0000 (13:11 -0700)
src/documents/consumer.py		patch \| blob \| blame \| history
src/documents/converters.py		patch \| blob \| blame \| history
src/documents/parsers.py		patch \| blob \| blame \| history
src/documents/tests/test_consumer.py		patch \| blob \| blame \| history
src/documents/utils.py		patch \| blob \| blame \| history
src/paperless_tesseract/parsers.py		patch \| blob \| blame \| history