Defining a storage path is optional. If no storage path is defined for a
document, the global [`PAPERLESS_FILENAME_FORMAT`](configuration.md#PAPERLESS_FILENAME_FORMAT) is applied.
+## Automatic recovery of invalid PDFs {#pdf-recovery}
+
+Paperless will attempt to "clean" certain invalid PDFs with `qpdf` before processing them, for example when a file
+named `*.pdf` is not detected as a PDF by MIME type detection. This can happen if the PDF is not properly formatted or contains errors.
+
## Celery Monitoring {#celery-monitoring}
The monitoring tool
)
self.working_copy = Path(tempdir.name) / Path(self.filename)
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
+ self.unmodified_original = None
# Determine the parser class.
self.log.debug(f"Detected mime type: {mime_type}")
+ if (
+ Path(self.filename).suffix.lower() == ".pdf"
+ and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+ ):
+ try:
+ # The file might be a pdf, but the mime type is wrong.
+ # Try to clean with qpdf
+ self.log.debug(
+ "Detected possible PDF with wrong mime type, trying to clean with qpdf",
+ )
+ run_subprocess(
+ [
+ "qpdf",
+ "--replace-input",
+ self.working_copy,
+ ],
+ logger=self.log,
+ )
+ mime_type = magic.from_file(self.working_copy, mime=True)
+ self.log.debug(f"Detected mime type after qpdf: {mime_type}")
+ # Save the original file for later
+ self.unmodified_original = (
+ Path(tempdir.name) / Path("uo") / Path(self.filename)
+ )
+ copy_file_with_basic_stats(
+ self.input_doc.original_file,
+ self.unmodified_original,
+ )
+ except Exception as e:
+ self.log.error(f"Error attempting to clean PDF: {e}")
+
# Based on the mime type, get the parser for that type
parser_class: Optional[type[DocumentParser]] = (
get_parser_class_for_mime_type(
self._write(
document.storage_type,
- self.working_copy,
+ self.unmodified_original
+ if self.unmodified_original is not None
+ else self.working_copy,
document.source_path,
)
self.log.debug(f"Deleting file {self.working_copy}")
self.input_doc.original_file.unlink()
self.working_copy.unlink()
+ if self.unmodified_original is not None: # pragma: no cover
+ self.unmodified_original.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join(
mime_type = magic.from_buffer(document_data, mime=True)
if not is_mime_type_supported(mime_type):
- raise serializers.ValidationError(
- _("File type %(type)s not supported") % {"type": mime_type},
- )
+ if (
+ mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+ and document.name.endswith(
+ ".pdf",
+ )
+ ):
+ # If the file is an invalid PDF, we can try to recover it later in the consumer
+ mime_type = "application/pdf"
+ else:
+ raise serializers.ValidationError(
+ _("File type %(type)s not supported") % {"type": mime_type},
+ )
return document.name, document_data
self.assertEqual(overrides.filename, "simple.pdf")
self.assertEqual(overrides.custom_field_ids, [custom_field.id])
+ def test_upload_invalid_pdf(self):
+ """
+        GIVEN: Invalid PDF named "*.pdf" whose mime_type is in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+ WHEN: Upload the file
+ THEN: The file is not rejected
+ """
+ self.consume_file_mock.return_value = celery.result.AsyncResult(
+ id=str(uuid.uuid4()),
+ )
+
+ with open(
+ os.path.join(os.path.dirname(__file__), "samples", "invalid_pdf.pdf"),
+ "rb",
+ ) as f:
+ response = self.client.post(
+ "/api/documents/post_document/",
+ {"document": f},
+ )
+
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+
def test_get_metadata(self):
doc = Document.objects.create(
title="test",
def fake_magic_from_file(file, mime=False):
if mime:
+ if file.name.startswith("invalid_pdf"):
+ return "application/octet-stream"
if os.path.splitext(file)[1] == ".pdf":
return "application/pdf"
elif os.path.splitext(file)[1] == ".png":
sanity_check()
+ @mock.patch("documents.consumer.run_subprocess")
+ def test_try_to_clean_invalid_pdf(self, m):
+ shutil.copy(
+ Path(__file__).parent / "samples" / "invalid_pdf.pdf",
+ settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+ )
+ with self.get_consumer(
+ settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+ ) as consumer:
+ # fails because no qpdf
+ self.assertRaises(ConsumerError, consumer.run)
+
+ m.assert_called_once()
+
+ args, _ = m.call_args
+
+ command = args[0]
+
+ self.assertEqual(command[0], "qpdf")
+ self.assertEqual(command[1], "--replace-input")
+
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
)
+CONSUMER_PDF_RECOVERABLE_MIME_TYPES = ("application/octet-stream",)
+
OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
# The default language that tesseract will attempt to use when parsing