Feature: auto-clean some invalid pdfs (#7651)

author shamoon <4887959+shamoon@users.noreply.github.com>

Wed, 25 Sep 2024 15:57:20 +0000 (08:57 -0700)

committer GitHub <noreply@github.com>

Wed, 25 Sep 2024 15:57:20 +0000 (15:57 +0000)
author shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 25 Sep 2024 15:57:20 +0000 (08:57 -0700)
committer GitHub <noreply@github.com>
Wed, 25 Sep 2024 15:57:20 +0000 (15:57 +0000)
diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md

index 30687680cd837cb10bbb93c95afed2a8cc895448..fe8d2e305334dbc8db49d221f088ef7bdaaf9730 100644 (file)
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -418,6 +418,11 @@ Insurances/                             # Insurances
      Defining a storage path is optional. If no storage path is defined for a
      document, the global [`PAPERLESS_FILENAME_FORMAT`](configuration.md#PAPERLESS_FILENAME_FORMAT) is applied.
  
+## Automatic recovery of invalid PDFs {#pdf-recovery}
+
+Paperless will attempt to "clean" certain invalid PDFs with `qpdf` before processing, for example when a file with a
+`.pdf` extension is detected with the wrong MIME type. This can happen if the PDF is not properly formatted or contains errors.
+
  ## Celery Monitoring {#celery-monitoring}
  
  The monitoring tool
diff --git a/src/documents/consumer.py b/src/documents/consumer.py

index 57277e4a6e7859168ab0cd1336e034546eb8aa82..97910e24b418d296281f97c203800977f873ef2c 100644 (file)
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -532,6 +532,7 @@ class ConsumerPlugin(
              )
              self.working_copy = Path(tempdir.name) / Path(self.filename)
              copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
+            self.unmodified_original = None
  
              # Determine the parser class.
  
@@ -539,6 +540,37 @@ class ConsumerPlugin(
  
              self.log.debug(f"Detected mime type: {mime_type}")
  
+            if (
+                Path(self.filename).suffix.lower() == ".pdf"
+                and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+            ):
+                try:
+                    # The file might be a pdf, but the mime type is wrong.
+                    # Try to clean with qpdf
+                    self.log.debug(
+                        "Detected possible PDF with wrong mime type, trying to clean with qpdf",
+                    )
+                    run_subprocess(
+                        [
+                            "qpdf",
+                            "--replace-input",
+                            self.working_copy,
+                        ],
+                        logger=self.log,
+                    )
+                    mime_type = magic.from_file(self.working_copy, mime=True)
+                    self.log.debug(f"Detected mime type after qpdf: {mime_type}")
+                    # Save the original file for later
+                    self.unmodified_original = (
+                        Path(tempdir.name) / Path("uo") / Path(self.filename)
+                    )
+                    copy_file_with_basic_stats(
+                        self.input_doc.original_file,
+                        self.unmodified_original,
+                    )
+                except Exception as e:
+                    self.log.error(f"Error attempting to clean PDF: {e}")
+
              # Based on the mime type, get the parser for that type
              parser_class: Optional[type[DocumentParser]] = (
                  get_parser_class_for_mime_type(
@@ -689,7 +721,9 @@ class ConsumerPlugin(
  
                      self._write(
                          document.storage_type,
-                        self.working_copy,
+                        self.unmodified_original
+                        if self.unmodified_original is not None
+                        else self.working_copy,
                          document.source_path,
                      )
  
@@ -725,6 +759,8 @@ class ConsumerPlugin(
                  self.log.debug(f"Deleting file {self.working_copy}")
                  self.input_doc.original_file.unlink()
                  self.working_copy.unlink()
+                if self.unmodified_original is not None:  # pragma: no cover
+                    self.unmodified_original.unlink()
  
                  # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                  shadow_file = os.path.join(
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py

index 737d1256f148c56bb819cc26cf1cb84beae60ac6..30f3dd26d140d4ba6c9b498c76bb7c5d1d2bbaa9 100644 (file)
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -1389,9 +1389,18 @@ class PostDocumentSerializer(serializers.Serializer):
          mime_type = magic.from_buffer(document_data, mime=True)
  
          if not is_mime_type_supported(mime_type):
-            raise serializers.ValidationError(
-                _("File type %(type)s not supported") % {"type": mime_type},
-            )
+            if (
+                mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+                and document.name.endswith(
+                    ".pdf",
+                )
+            ):
+                # If the file is an invalid PDF, we can try to recover it later in the consumer
+                mime_type = "application/pdf"
+            else:
+                raise serializers.ValidationError(
+                    _("File type %(type)s not supported") % {"type": mime_type},
+                )
  
          return document.name, document_data
  
diff --git a/src/documents/tests/samples/invalid_pdf.pdf b/src/documents/tests/samples/invalid_pdf.pdf

new file mode 100644 (file)

index 0000000..f226c2d

Binary files /dev/null and b/src/documents/tests/samples/invalid_pdf.pdf differ
diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py

index ee2e8ee1ee6d4353f902f71c5f50bdcd28a7189e..b1cd43932ed15c813efd247d77c4805fcb1deb84 100644 (file)
--- a/src/documents/tests/test_api_documents.py
+++ b/src/documents/tests/test_api_documents.py
@@ -1402,6 +1402,27 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
          self.assertEqual(overrides.filename, "simple.pdf")
          self.assertEqual(overrides.custom_field_ids, [custom_field.id])
  
+    def test_upload_invalid_pdf(self):
+        """
+        GIVEN: Invalid PDF named "*.pdf" whose mime_type is in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+        WHEN: Upload the file
+        THEN: The file is not rejected
+        """
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )
+
+        with open(
+            os.path.join(os.path.dirname(__file__), "samples", "invalid_pdf.pdf"),
+            "rb",
+        ) as f:
+            response = self.client.post(
+                "/api/documents/post_document/",
+                {"document": f},
+            )
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+
      def test_get_metadata(self):
          doc = Document.objects.create(
              title="test",
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py

index 5b56e2ccaad0f7130a6a5d44feb9c953fde6a6bf..aa452e15b8e875a63df30ae63a5b60bf305dba73 100644 (file)
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -235,6 +235,8 @@ class FaultyGenericExceptionParser(_BaseTestParser):
  
  def fake_magic_from_file(file, mime=False):
      if mime:
+        if file.name.startswith("invalid_pdf"):
+            return "application/octet-stream"
          if os.path.splitext(file)[1] == ".pdf":
              return "application/pdf"
          elif os.path.splitext(file)[1] == ".png":
@@ -952,6 +954,27 @@ class TestConsumer(
  
          sanity_check()
  
+    @mock.patch("documents.consumer.run_subprocess")
+    def test_try_to_clean_invalid_pdf(self, m):
+        shutil.copy(
+            Path(__file__).parent / "samples" / "invalid_pdf.pdf",
+            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+        )
+        with self.get_consumer(
+            settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+        ) as consumer:
+            # fails because no qpdf
+            self.assertRaises(ConsumerError, consumer.run)
+
+            m.assert_called_once()
+
+            args, _ = m.call_args
+
+            command = args[0]
+
+            self.assertEqual(command[0], "qpdf")
+            self.assertEqual(command[1], "--replace-input")
+
  
  @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
  class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
diff --git a/src/paperless/settings.py b/src/paperless/settings.py

index 851fe62172e622aa4b16079d5a54448692c1be46..2da0b49f1db9f27d5efa05e87441e726c079c966 100644 (file)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -960,6 +960,8 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
      "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
  )
  
+CONSUMER_PDF_RECOVERABLE_MIME_TYPES = ("application/octet-stream",)
+
  OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
  
  # The default language that tesseract will attempt to use when parsing
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Wed, 25 Sep 2024 15:57:20 +0000 (08:57 -0700)
committer	GitHub <noreply@github.com>
	Wed, 25 Sep 2024 15:57:20 +0000 (15:57 +0000)
docs/advanced_usage.md		patch \| blob \| blame \| history
src/documents/consumer.py		patch \| blob \| blame \| history
src/documents/serialisers.py		patch \| blob \| blame \| history
src/documents/tests/samples/invalid_pdf.pdf	[new file with mode: 0644]	patch \| blob
src/documents/tests/test_api_documents.py		patch \| blob \| blame \| history
src/documents/tests/test_consumer.py		patch \| blob \| blame \| history
src/paperless/settings.py		patch \| blob \| blame \| history