git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Feature: auto-clean some invalid pdfs (#7651)
author: shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 25 Sep 2024 15:57:20 +0000 (08:57 -0700)
committer: GitHub <noreply@github.com>
Wed, 25 Sep 2024 15:57:20 +0000 (15:57 +0000)
docs/advanced_usage.md
src/documents/consumer.py
src/documents/serialisers.py
src/documents/tests/samples/invalid_pdf.pdf [new file with mode: 0644]
src/documents/tests/test_api_documents.py
src/documents/tests/test_consumer.py
src/paperless/settings.py

index 30687680cd837cb10bbb93c95afed2a8cc895448..fe8d2e305334dbc8db49d221f088ef7bdaaf9730 100644 (file)
@@ -418,6 +418,11 @@ Insurances/                             # Insurances
     Defining a storage path is optional. If no storage path is defined for a
     document, the global [`PAPERLESS_FILENAME_FORMAT`](configuration.md#PAPERLESS_FILENAME_FORMAT) is applied.
 
+## Automatic recovery of invalid PDFs {#pdf-recovery}
+
+Paperless will attempt to "clean" certain invalid PDFs with `qpdf` before processing them. This applies, for example,
+when a file named `*.pdf` is not detected as a PDF by its MIME type, which can happen if the PDF is malformed or contains errors.
+
 ## Celery Monitoring {#celery-monitoring}
 
 The monitoring tool
index 57277e4a6e7859168ab0cd1336e034546eb8aa82..97910e24b418d296281f97c203800977f873ef2c 100644 (file)
@@ -532,6 +532,7 @@ class ConsumerPlugin(
             )
             self.working_copy = Path(tempdir.name) / Path(self.filename)
             copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
+            self.unmodified_original = None
 
             # Determine the parser class.
 
@@ -539,6 +540,37 @@ class ConsumerPlugin(
 
             self.log.debug(f"Detected mime type: {mime_type}")
 
+            if (
+                Path(self.filename).suffix.lower() == ".pdf"
+                and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+            ):
+                try:
+                    # The file might be a pdf, but the mime type is wrong.
+                    # Try to clean with qpdf
+                    self.log.debug(
+                        "Detected possible PDF with wrong mime type, trying to clean with qpdf",
+                    )
+                    run_subprocess(
+                        [
+                            "qpdf",
+                            "--replace-input",
+                            self.working_copy,
+                        ],
+                        logger=self.log,
+                    )
+                    mime_type = magic.from_file(self.working_copy, mime=True)
+                    self.log.debug(f"Detected mime type after qpdf: {mime_type}")
+                    # Save the original file for later
+                    self.unmodified_original = (
+                        Path(tempdir.name) / Path("uo") / Path(self.filename)
+                    )
+                    copy_file_with_basic_stats(
+                        self.input_doc.original_file,
+                        self.unmodified_original,
+                    )
+                except Exception as e:
+                    self.log.error(f"Error attempting to clean PDF: {e}")
+
             # Based on the mime type, get the parser for that type
             parser_class: Optional[type[DocumentParser]] = (
                 get_parser_class_for_mime_type(
@@ -689,7 +721,9 @@ class ConsumerPlugin(
 
                     self._write(
                         document.storage_type,
-                        self.working_copy,
+                        self.unmodified_original
+                        if self.unmodified_original is not None
+                        else self.working_copy,
                         document.source_path,
                     )
 
@@ -725,6 +759,8 @@ class ConsumerPlugin(
                 self.log.debug(f"Deleting file {self.working_copy}")
                 self.input_doc.original_file.unlink()
                 self.working_copy.unlink()
+                if self.unmodified_original is not None:  # pragma: no cover
+                    self.unmodified_original.unlink()
 
                 # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                 shadow_file = os.path.join(
index 737d1256f148c56bb819cc26cf1cb84beae60ac6..30f3dd26d140d4ba6c9b498c76bb7c5d1d2bbaa9 100644 (file)
@@ -1389,9 +1389,18 @@ class PostDocumentSerializer(serializers.Serializer):
         mime_type = magic.from_buffer(document_data, mime=True)
 
         if not is_mime_type_supported(mime_type):
-            raise serializers.ValidationError(
-                _("File type %(type)s not supported") % {"type": mime_type},
-            )
+            if (
+                mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+                and document.name.endswith(
+                    ".pdf",
+                )
+            ):
+                # If the file is an invalid PDF, we can try to recover it later in the consumer
+                mime_type = "application/pdf"
+            else:
+                raise serializers.ValidationError(
+                    _("File type %(type)s not supported") % {"type": mime_type},
+                )
 
         return document.name, document_data
 
diff --git a/src/documents/tests/samples/invalid_pdf.pdf b/src/documents/tests/samples/invalid_pdf.pdf
new file mode 100644 (file)
index 0000000..f226c2d
Binary files /dev/null and b/src/documents/tests/samples/invalid_pdf.pdf differ
index ee2e8ee1ee6d4353f902f71c5f50bdcd28a7189e..b1cd43932ed15c813efd247d77c4805fcb1deb84 100644 (file)
@@ -1402,6 +1402,27 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         self.assertEqual(overrides.filename, "simple.pdf")
         self.assertEqual(overrides.custom_field_ids, [custom_field.id])
 
+    def test_upload_invalid_pdf(self):
+        """
+        GIVEN: Invalid PDF named "*.pdf" whose mime_type is in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+        WHEN: Upload the file
+        THEN: The file is not rejected
+        """
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )
+
+        with open(
+            os.path.join(os.path.dirname(__file__), "samples", "invalid_pdf.pdf"),
+            "rb",
+        ) as f:
+            response = self.client.post(
+                "/api/documents/post_document/",
+                {"document": f},
+            )
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+
     def test_get_metadata(self):
         doc = Document.objects.create(
             title="test",
index 5b56e2ccaad0f7130a6a5d44feb9c953fde6a6bf..aa452e15b8e875a63df30ae63a5b60bf305dba73 100644 (file)
@@ -235,6 +235,8 @@ class FaultyGenericExceptionParser(_BaseTestParser):
 
 def fake_magic_from_file(file, mime=False):
     if mime:
+        if file.name.startswith("invalid_pdf"):
+            return "application/octet-stream"
         if os.path.splitext(file)[1] == ".pdf":
             return "application/pdf"
         elif os.path.splitext(file)[1] == ".png":
@@ -952,6 +954,27 @@ class TestConsumer(
 
         sanity_check()
 
+    @mock.patch("documents.consumer.run_subprocess")
+    def test_try_to_clean_invalid_pdf(self, m):
+        shutil.copy(
+            Path(__file__).parent / "samples" / "invalid_pdf.pdf",
+            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+        )
+        with self.get_consumer(
+            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+        ) as consumer:
+            # fails because no qpdf
+            self.assertRaises(ConsumerError, consumer.run)
+
+            m.assert_called_once()
+
+            args, _ = m.call_args
+
+            command = args[0]
+
+            self.assertEqual(command[0], "qpdf")
+            self.assertEqual(command[1], "--replace-input")
+
 
 @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
 class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
index 851fe62172e622aa4b16079d5a54448692c1be46..2da0b49f1db9f27d5efa05e87441e726c079c966 100644 (file)
@@ -960,6 +960,8 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
     "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
 )
 
+CONSUMER_PDF_RECOVERABLE_MIME_TYPES = ("application/octet-stream",)
+
 OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
 
 # The default language that tesseract will attempt to use when parsing