git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Move it out of consumer
author shamoon <4887959+shamoon@users.noreply.github.com>
Thu, 7 Nov 2024 21:19:06 +0000 (13:19 -0800)
committer shamoon <4887959+shamoon@users.noreply.github.com>
Thu, 7 Nov 2024 21:19:06 +0000 (13:19 -0800)
src/documents/consumer.py
src/documents/tasks.py
src/documents/tests/samples/corrupted.pdf
src/documents/tests/test_tasks.py

index ca8e2d37876dcf6b394c788c990269be4edf8382..a916afb2d7c6d3642f811512c237fe752c13245f 100644 (file)
@@ -808,50 +808,3 @@ class ConsumerPlugin(
             copy_basic_file_stats(source, target)
         except Exception:  # pragma: no cover
             pass
-
-
-class CleanPDFPlugin(
-    NoCleanupPluginMixin,
-    NoSetupPluginMixin,
-    AlwaysRunPluginMixin,
-    LoggingMixin,
-    ConsumeTaskPlugin,
-):
-    NAME: str = "CleanPDFPlugin"
-    logging_name = "paperless.consumer"
-
-    def __init__(
-        self,
-        input_doc: ConsumableDocument,
-        metadata: DocumentMetadataOverrides,
-        status_mgr: ProgressManager,
-        base_tmp_dir: Path,
-        task_id: str,
-    ) -> None:
-        super().__init__(input_doc, metadata, status_mgr, base_tmp_dir, task_id)
-
-        self.renew_logging_group()
-
-    def run(self) -> str | None:
-        """
-        Tries to clean a PDF file with qpdf
-        """
-        msg = None
-        try:
-            result = run_subprocess(
-                [
-                    "qpdf",
-                    "--replace-input",
-                    self.input_doc.original_file,
-                ],
-                logger=self.log,
-            )
-            msg = (
-                f"Error while cleaning PDF: {result.stderr}"
-                if result.returncode != 0
-                else "PDF cleaned successfully"
-            )
-        except Exception as e:
-            msg = "Error while cleaning PDF"
-            self.log.error(e)
-        return msg
index 7799abe2aadbc58118c826894f83a63d678a2e1a..c2b7194e2ce28655fb15e1a7dde4fab672420bdf 100644 (file)
@@ -24,7 +24,6 @@ from documents.barcodes import BarcodePlugin
 from documents.caching import clear_document_caches
 from documents.classifier import DocumentClassifier
 from documents.classifier import load_classifier
-from documents.consumer import CleanPDFPlugin
 from documents.consumer import ConsumerPlugin
 from documents.consumer import WorkflowTriggerPlugin
 from documents.data_models import ConsumableDocument
@@ -49,6 +48,7 @@ from documents.sanity_checker import SanityCheckFailedException
 from documents.signals import document_updated
 from documents.signals.handlers import cleanup_document_deletion
 from documents.utils import copy_file_with_basic_stats
+from documents.utils import run_subprocess
 
 if settings.AUDIT_LOG_ENABLED:
     from auditlog.models import LogEntry
@@ -111,7 +111,6 @@ def consume_file(
     self: Task,
     input_doc: ConsumableDocument,
     overrides: DocumentMetadataOverrides | None = None,
-    clean: bool = False,
 ):
     # Default no overrides
     if overrides is None:
@@ -124,9 +123,6 @@ def consume_file(
         ConsumerPlugin,
     ]
 
-    if clean:
-        plugins.insert(0, CleanPDFPlugin)
-
     with (
         ProgressManager(
             overrides.filename or input_doc.original_file.name,
@@ -189,13 +185,32 @@ def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False)
         working_copy = settings.SCRATCH_DIR / failed_file.name
         copy_file_with_basic_stats(failed_file, working_copy)
 
+        if clean:
+            try:
+                result = run_subprocess(
+                    [
+                        "qpdf",
+                        "--replace-input",
+                        "--warning-exit-0",
+                        working_copy,
+                    ],
+                    logger=logger,
+                )
+                if result.returncode != 0:
+                    raise Exception(
+                        f"qpdf failed with exit code {result.returncode}, error: {result.stderr}",
+                    )
+                else:
+                    logger.debug("PDF cleaned successfully")
+            except Exception as e:
+                logger.error(f"Error while cleaning PDF: {e}")
+                return
+
         consume_file(
             ConsumableDocument(
                 source=DocumentSource.ConsumeFolder,
                 original_file=working_copy,
             ),
-            clean=clean,
-            # skip_ocr=skip_ocr,
         )
 
 
index fcabb6a95c452a362e916f36f578d9841cfe7de0..61a0274b2b55a88f585f439cd7e412dab472a9f6 100644 (file)
Binary files a/src/documents/tests/samples/corrupted.pdf and b/src/documents/tests/samples/corrupted.pdf differ
index 8dc1edc397b0758cf94ef9f40e287d92480f61dd..210cbd11113042b3addbee717ebeb559a4ef091c 100644 (file)
@@ -248,6 +248,6 @@ class TestRetryConsumeTask(
         self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
 
         with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
-            with self.assertLogs("documents.tasks", level="INFO") as cm:
+            with self.assertLogs() as cm:
                 tasks.retry_failed_file(task_id=task.task_id, clean=True)
-                self.assertIn("PDF cleaned successfully", cm.output[0])
+                self.assertIn("New document id 1 created", cm.output[-1])