copy_basic_file_stats(source, target)
except Exception: # pragma: no cover
pass
-
-
-class CleanPDFPlugin(
- NoCleanupPluginMixin,
- NoSetupPluginMixin,
- AlwaysRunPluginMixin,
- LoggingMixin,
- ConsumeTaskPlugin,
-):
- NAME: str = "CleanPDFPlugin"
- logging_name = "paperless.consumer"
-
- def __init__(
- self,
- input_doc: ConsumableDocument,
- metadata: DocumentMetadataOverrides,
- status_mgr: ProgressManager,
- base_tmp_dir: Path,
- task_id: str,
- ) -> None:
- super().__init__(input_doc, metadata, status_mgr, base_tmp_dir, task_id)
-
- self.renew_logging_group()
-
- def run(self) -> str | None:
- """
- Tries to clean a PDF file with qpdf
- """
- msg = None
- try:
- result = run_subprocess(
- [
- "qpdf",
- "--replace-input",
- self.input_doc.original_file,
- ],
- logger=self.log,
- )
- msg = (
- f"Error while cleaning PDF: {result.stderr}"
- if result.returncode != 0
- else "PDF cleaned successfully"
- )
- except Exception as e:
- msg = "Error while cleaning PDF"
- self.log.error(e)
- return msg
from documents.caching import clear_document_caches
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
-from documents.consumer import CleanPDFPlugin
from documents.consumer import ConsumerPlugin
from documents.consumer import WorkflowTriggerPlugin
from documents.data_models import ConsumableDocument
from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.utils import copy_file_with_basic_stats
+from documents.utils import run_subprocess
if settings.AUDIT_LOG_ENABLED:
from auditlog.models import LogEntry
self: Task,
input_doc: ConsumableDocument,
overrides: DocumentMetadataOverrides | None = None,
- clean: bool = False,
):
# Default no overrides
if overrides is None:
ConsumerPlugin,
]
- if clean:
- plugins.insert(0, CleanPDFPlugin)
-
with (
ProgressManager(
overrides.filename or input_doc.original_file.name,
working_copy = settings.SCRATCH_DIR / failed_file.name
copy_file_with_basic_stats(failed_file, working_copy)
+ if clean:
+ try:
+ result = run_subprocess(
+ [
+ "qpdf",
+ "--replace-input",
+ "--warning-exit-0",
+ working_copy,
+ ],
+ logger=logger,
+ )
+ if result.returncode != 0:
+ raise Exception(
+ f"qpdf failed with exit code {result.returncode}, error: {result.stderr}",
+ )
+ else:
+ logger.debug("PDF cleaned successfully")
+ except Exception as e:
+ logger.error(f"Error while cleaning PDF: {e}")
+ return
+
consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=working_copy,
),
- clean=clean,
- # skip_ocr=skip_ocr,
)
self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
- with self.assertLogs("documents.tasks", level="INFO") as cm:
+ with self.assertLogs() as cm:
tasks.retry_failed_file(task_id=task.task_id, clean=True)
- self.assertIn("PDF cleaned successfully", cm.output[0])
+ self.assertIn("New document id 1 created", cm.output[-1])