import shutil
import subprocess
import tempfile
+from functools import cache
from typing import Iterator
from typing import Match
from typing import Optional
from typing import Set
-import magic
from django.conf import settings
from django.utils import timezone
from documents.loggers import LoggingMixin
logger = logging.getLogger("paperless.parsing")
-def is_mime_type_supported(mime_type) -> bool:
+@cache
+def is_mime_type_supported(mime_type: str) -> bool:
+ """
+ Returns True if the mime type is supported, False otherwise
+ """
return get_parser_class_for_mime_type(mime_type) is not None
-def get_default_file_extension(mime_type) -> str:
+@cache
+def get_default_file_extension(mime_type: str) -> str:
+ """
+ Returns the default file extension for a mimetype, or
+ an empty string if it could not be determined
+ """
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"]
return ""
-def is_file_ext_supported(ext) -> bool:
+@cache
+def is_file_ext_supported(ext: str) -> bool:
+ """
+ Returns True if the file extension is supported, False otherwise
+ TODO: Investigate whether this is still needed, or if callers could check the mime type instead
+ """
if ext:
return ext.lower() in get_supported_file_extensions()
else:
for mime_type in supported_mime_types:
extensions.update(mimetypes.guess_all_extensions(mime_type))
+ # Python's stdlib mimetypes database may lag behind, so also add
+ # the extension the parser declares as the default for this mime type.
+ # This makes image/webp supported on Python < 3.11
+ extensions.add(supported_mime_types[mime_type])
return extensions
-def get_parser_class_for_mime_type(mime_type):
+def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
+ """
+ Returns the parser class with the highest weight declared for the
+ given mimetype, or None if no parser is declared for it
+ """
options = []
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
-def get_parser_class(path):
- """
- Determine the appropriate parser class based on the file
- """
-
- mime_type = magic.from_file(path, mime=True)
-
- return get_parser_class_for_mime_type(mime_type)
-
-
def run_convert(
input_file,
output_file,
-import os
-import shutil
-import tempfile
from tempfile import TemporaryDirectory
from unittest import mock
-from django.test import override_settings
from django.test import TestCase
-from documents.parsers import DocumentParser
from documents.parsers import get_default_file_extension
-from documents.parsers import get_parser_class
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import get_supported_file_extensions
from documents.parsers import is_file_ext_supported
from paperless_text.parsers import TextDocumentParser
-def fake_magic_from_file(file, mime=False):
-
- if mime:
- if os.path.splitext(file)[1] == ".pdf":
- return "application/pdf"
- else:
- return "unknown"
- else:
- return "A verbose string that describes the contents of the file"
-
-
-@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
- def test__get_parser_class_1_parser(self, m, *args):
+ def test_get_parser_class_1_parser(self, m, *args):
+ """
+ GIVEN:
+ - Parser declared for a given mimetype
+ WHEN:
+ - Attempt to get parser for the mimetype
+ THEN:
+ - Declared parser class is returned
+ """
+
class DummyParser:
pass
),
)
- self.assertEqual(get_parser_class("doc.pdf"), DummyParser)
+ self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
@mock.patch("documents.parsers.document_consumer_declaration.send")
- def test__get_parser_class_n_parsers(self, m, *args):
+ def test_get_parser_class_n_parsers(self, m, *args):
+ """
+ GIVEN:
+ - Two parsers declared for a given mimetype
+ - Second parser has a higher weight
+ WHEN:
+ - Attempt to get parser for the mimetype
+ THEN:
+ - Second parser class is returned
+ """
+
class DummyParser1:
pass
),
)
- self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)
+ self.assertEqual(
+ get_parser_class_for_mime_type("application/pdf"),
+ DummyParser2,
+ )
@mock.patch("documents.parsers.document_consumer_declaration.send")
- def test__get_parser_class_0_parsers(self, m, *args):
+ def test_get_parser_class_0_parsers(self, m, *args):
+ """
+ GIVEN:
+ - No parsers are declared
+ WHEN:
+ - Attempt to get parser for the mimetype
+ THEN:
+ - No parser class is returned
+ """
m.return_value = []
with TemporaryDirectory() as tmpdir:
- self.assertIsNone(get_parser_class("doc.pdf"))
+ self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
+ @mock.patch("documents.parsers.document_consumer_declaration.send")
+ def test_get_parser_class_no_valid_parser(self, m, *args):
+ """
+ GIVEN:
+ - No parser declared for a given mimetype
+ - Parser declared for a different mimetype
+ WHEN:
+ - Attempt to get parser for the given mimetype
+ THEN:
+ - No parser class is returned
+ """
-def fake_get_thumbnail(self, path, mimetype, file_name):
- return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
+ class DummyParser:
+ pass
+
+ m.return_value = (
+ (
+ None,
+ {
+ "weight": 0,
+ "parser": DummyParser,
+ "mime_types": {"application/pdf": ".pdf"},
+ },
+ ),
+ )
+
+ self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
class TestParserAvailability(TestCase):
def test_file_extensions(self):
- for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
- self.assertIn(ext, get_supported_file_extensions())
- self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
- self.assertEqual(get_default_file_extension("image/png"), ".png")
- self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
- self.assertEqual(get_default_file_extension("text/plain"), ".txt")
- self.assertEqual(get_default_file_extension("text/csv"), ".csv")
+ supported_mimes_and_exts = [
+ ("application/pdf", ".pdf"),
+ ("image/png", ".png"),
+ ("image/jpeg", ".jpg"),
+ ("image/tiff", ".tif"),
+ ("image/webp", ".webp"),
+ ("text/plain", ".txt"),
+ ("text/csv", ".csv"),
+ ]
+
+ supported_exts = get_supported_file_extensions()
+
+ for mime_type, ext in supported_mimes_and_exts:
+ self.assertIn(ext, supported_exts)
+ self.assertEqual(get_default_file_extension(mime_type), ext)
+
+ # Test that a mimetype with no declared parser still returns an extension
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
+
+ # Test invalid mimetype returns no extension
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
self.assertIsInstance(
get_parser_class_for_mime_type("text/plain")(logging_group=None),
TextDocumentParser,
)
- self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None)
+ self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
self.assertTrue(is_file_ext_supported(".pdf"))
self.assertFalse(is_file_ext_supported(".hsdfh"))