Cleans up and improves parser discovery testing, simplifies the determination of...

author Trenton H <797416+stumpylog@users.noreply.github.com>

Wed, 4 Jan 2023 18:18:31 +0000 (10:18 -0800)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Thu, 5 Jan 2023 16:39:48 +0000 (08:39 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Wed, 4 Jan 2023 18:18:31 +0000 (10:18 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Thu, 5 Jan 2023 16:39:48 +0000 (08:39 -0800)
diff --git a/src/documents/parsers.py b/src/documents/parsers.py

index e2309b366521aaeef24194d451d6c8885f42c1be..240d60e7fd4c99ace767d92b8a5161b8b80b527b 100644 (file)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -6,12 +6,12 @@ import re
  import shutil
  import subprocess
  import tempfile
+from functools import cache
  from typing import Iterator
  from typing import Match
  from typing import Optional
  from typing import Set
  
-import magic
  from django.conf import settings
  from django.utils import timezone
  from documents.loggers import LoggingMixin
@@ -45,11 +45,20 @@ DATE_REGEX = re.compile(
  logger = logging.getLogger("paperless.parsing")
  
  
-def is_mime_type_supported(mime_type) -> bool:
+@cache
+def is_mime_type_supported(mime_type: str) -> bool:
+    """
+    Returns True if the mime type is supported, False otherwise
+    """
      return get_parser_class_for_mime_type(mime_type) is not None
  
  
-def get_default_file_extension(mime_type) -> str:
+@cache
+def get_default_file_extension(mime_type: str) -> str:
+    """
+    Returns the default file extension for a mimetype, or
+    an empty string if it could not be determined
+    """
      for response in document_consumer_declaration.send(None):
          parser_declaration = response[1]
          supported_mime_types = parser_declaration["mime_types"]
@@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str:
          return ""
  
  
-def is_file_ext_supported(ext) -> bool:
+@cache
+def is_file_ext_supported(ext: str) -> bool:
+    """
+    Returns True if the file extension is supported, False otherwise
+    TODO: Investigate why this really exists, why not use mimetype
+    """
      if ext:
          return ext.lower() in get_supported_file_extensions()
      else:
@@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]:
  
          for mime_type in supported_mime_types:
              extensions.update(mimetypes.guess_all_extensions(mime_type))
+            # Python's stdlib might be behind, so also add what the parser
+            # says is the default extension
+            # This makes image/webp supported on Python < 3.11
+            extensions.add(supported_mime_types[mime_type])
  
      return extensions
  
  
-def get_parser_class_for_mime_type(mime_type):
+def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
+    """
+    Returns the best parser (by weight) for the given mimetype or
+    None if no parser exists
+    """
  
      options = []
  
@@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type):
      return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
  
  
-def get_parser_class(path):
-    """
-    Determine the appropriate parser class based on the file
-    """
-
-    mime_type = magic.from_file(path, mime=True)
-
-    return get_parser_class_for_mime_type(mime_type)
-
-
  def run_convert(
      input_file,
      output_file,
diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py

index 1942fe0dde931128e7b6c413a82570d9b95c3112..8ba2c70ee09eca8d5eb4613d89ee51adb4d1d00a 100644 (file)
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -1,14 +1,8 @@
-import os
-import shutil
-import tempfile
  from tempfile import TemporaryDirectory
  from unittest import mock
  
-from django.test import override_settings
  from django.test import TestCase
-from documents.parsers import DocumentParser
  from documents.parsers import get_default_file_extension
-from documents.parsers import get_parser_class
  from documents.parsers import get_parser_class_for_mime_type
  from documents.parsers import get_supported_file_extensions
  from documents.parsers import is_file_ext_supported
@@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser
  from paperless_text.parsers import TextDocumentParser
  
  
-def fake_magic_from_file(file, mime=False):
-
-    if mime:
-        if os.path.splitext(file)[1] == ".pdf":
-            return "application/pdf"
-        else:
-            return "unknown"
-    else:
-        return "A verbose string that describes the contents of the file"
-
-
-@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
  class TestParserDiscovery(TestCase):
      @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test__get_parser_class_1_parser(self, m, *args):
+    def test_get_parser_class_1_parser(self, m, *args):
+        """
+        GIVEN:
+            - Parser declared for a given mimetype
+        WHEN:
+            - Attempt to get parser for the mimetype
+        THEN:
+            - Declared parser class is returned
+        """
+
          class DummyParser:
              pass
  
@@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase):
              ),
          )
  
-        self.assertEqual(get_parser_class("doc.pdf"), DummyParser)
+        self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
  
      @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test__get_parser_class_n_parsers(self, m, *args):
+    def test_get_parser_class_n_parsers(self, m, *args):
+        """
+        GIVEN:
+            - Two parsers declared for a given mimetype
+            - Second parser has a higher weight
+        WHEN:
+            - Attempt to get parser for the mimetype
+        THEN:
+            - Second parser class is returned
+        """
+
          class DummyParser1:
              pass
  
@@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase):
              ),
          )
  
-        self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)
+        self.assertEqual(
+            get_parser_class_for_mime_type("application/pdf"),
+            DummyParser2,
+        )
  
      @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test__get_parser_class_0_parsers(self, m, *args):
+    def test_get_parser_class_0_parsers(self, m, *args):
+        """
+        GIVEN:
+            - No parsers are declared
+        WHEN:
+            - Attempt to get parser for the mimetype
+        THEN:
+            - No parser class is returned
+        """
          m.return_value = []
          with TemporaryDirectory() as tmpdir:
-            self.assertIsNone(get_parser_class("doc.pdf"))
+            self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
  
+    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    def test_get_parser_class_no_valid_parser(self, m, *args):
+        """
+        GIVEN:
+            - No parser declared for a given mimetype
+            - Parser declared for a different mimetype
+        WHEN:
+            - Attempt to get parser for the given mimetype
+        THEN:
+            - No parser class is returned
+        """
  
-def fake_get_thumbnail(self, path, mimetype, file_name):
-    return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
+        class DummyParser:
+            pass
+
+        m.return_value = (
+            (
+                None,
+                {
+                    "weight": 0,
+                    "parser": DummyParser,
+                    "mime_types": {"application/pdf": ".pdf"},
+                },
+            ),
+        )
+
+        self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
  
  
  class TestParserAvailability(TestCase):
      def test_file_extensions(self):
  
-        for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
-            self.assertIn(ext, get_supported_file_extensions())
-        self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
-        self.assertEqual(get_default_file_extension("image/png"), ".png")
-        self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
-        self.assertEqual(get_default_file_extension("text/plain"), ".txt")
-        self.assertEqual(get_default_file_extension("text/csv"), ".csv")
+        supported_mimes_and_exts = [
+            ("application/pdf", ".pdf"),
+            ("image/png", ".png"),
+            ("image/jpeg", ".jpg"),
+            ("image/tiff", ".tif"),
+            ("image/webp", ".webp"),
+            ("text/plain", ".txt"),
+            ("text/csv", ".csv"),
+        ]
+
+        supported_exts = get_supported_file_extensions()
+
+        for mime_type, ext in supported_mimes_and_exts:
+            self.assertIn(ext, supported_exts)
+            self.assertEqual(get_default_file_extension(mime_type), ext)
+
+        # Test no parser declared still returns an extension
          self.assertEqual(get_default_file_extension("application/zip"), ".zip")
+
+        # Test invalid mimetype returns no extension
          self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
  
          self.assertIsInstance(
@@ -108,7 +156,7 @@ class TestParserAvailability(TestCase):
              get_parser_class_for_mime_type("text/plain")(logging_group=None),
              TextDocumentParser,
          )
-        self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None)
+        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
  
          self.assertTrue(is_file_ext_supported(".pdf"))
          self.assertFalse(is_file_ext_supported(".hsdfh"))
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Wed, 4 Jan 2023 18:18:31 +0000 (10:18 -0800)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Thu, 5 Jan 2023 16:39:48 +0000 (08:39 -0800)
src/documents/parsers.py		patch \| blob \| blame \| history
src/documents/tests/test_parsers.py		patch \| blob \| blame \| history