]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Feature: Enable users to customize date parsing via plugins (#11931)
authorTrenton H <797416+stumpylog@users.noreply.github.com>
Tue, 3 Feb 2026 20:09:13 +0000 (12:09 -0800)
committerGitHub <noreply@github.com>
Tue, 3 Feb 2026 20:09:13 +0000 (20:09 +0000)
14 files changed:
docs/development.md
pyproject.toml
src/documents/consumer.py
src/documents/parsers.py
src/documents/plugins/date_parsing/__init__.py [new file with mode: 0644]
src/documents/plugins/date_parsing/base.py [new file with mode: 0644]
src/documents/plugins/date_parsing/regex_parser.py [new file with mode: 0644]
src/documents/tests/date_parsing/__init__.py [new file with mode: 0644]
src/documents/tests/date_parsing/conftest.py [new file with mode: 0644]
src/documents/tests/date_parsing/test_date_parser_plugin_loading.py [new file with mode: 0644]
src/documents/tests/date_parsing/test_date_parsing.py [new file with mode: 0644]
src/documents/tests/test_api_documents.py
src/documents/tests/test_date_parsing.py [deleted file]
src/documents/views.py

index de328e1f8e3ee783260f4456f0adf20ec7c94467..9d3b1460a1c0333bb4a2dbadc3aa1dca8246e563 100644 (file)
@@ -481,3 +481,147 @@ To get started:
 
 5. The project is ready for debugging; either run the fullstack debug or start individual debug
    processes. To spin up the project without debugging, run the task **Project Start: Run all Services**
+
+## Developing Date Parser Plugins
+
+Paperless-ngx uses a plugin system for date parsing, allowing you to extend or replace the default date parsing behavior. Plugins are discovered using [Python entry points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
+
+### Creating a Date Parser Plugin
+
+To create a custom date parser plugin, you need to:
+
+1. Create a class that inherits from `DateParserPluginBase`
+2. Implement the required abstract method
+3. Register your plugin via an entry point
+
+#### 1. Implementing the Parser Class
+
+Your parser must extend `documents.plugins.date_parsing.DateParserPluginBase` and implement the `parse` method:
+
+```python
+from collections.abc import Iterator
+import datetime
+
+from documents.plugins.date_parsing import DateParserPluginBase
+
+
+class MyDateParserPlugin(DateParserPluginBase):
+    """
+    Custom date parser implementation.
+    """
+
+    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
+        """
+        Parse dates from the document's filename and content.
+
+        Args:
+            filename: The original filename of the document
+            content: The extracted text content of the document
+
+        Yields:
+            datetime.datetime: Valid datetime objects found in the document
+        """
+        # Your parsing logic here
+        # Use self.config to access configuration settings
+
+        # Example: parse dates from filename first
+        if self.config.filename_date_order:
+            # Your filename parsing logic
+            yield some_datetime
+
+        # Then parse dates from content
+        # Your content parsing logic
+        yield another_datetime
+```
+
+#### 2. Configuration and Helper Methods
+
+Your parser instance is initialized with a `DateParserConfig` object accessible via `self.config`. This provides:
+
+-   `languages: list[str]` - List of language codes for date parsing
+-   `timezone_str: str` - Timezone string for date localization
+-   `ignore_dates: set[datetime.date]` - Dates that should be filtered out
+-   `reference_time: datetime.datetime` - Current time for filtering future dates
+-   `filename_date_order: str | None` - Date order preference for filenames (e.g., "DMY", "MDY")
+-   `content_date_order: str` - Date order preference for content
+
+The base class provides two helper methods you can use:
+
+```python
+def _parse_string(
+    self,
+    date_string: str,
+    date_order: str,
+) -> datetime.datetime | None:
+    """
+    Parse a single date string using dateparser with configured settings.
+    """
+
+def _filter_date(
+    self,
+    date: datetime.datetime | None,
+) -> datetime.datetime | None:
+    """
+    Validate a parsed datetime against configured rules.
+    Filters out dates in the year 1900 or earlier, future dates, and ignored dates.
+    """
+```
+
+#### 3. Resource Management (Optional)
+
+If your plugin needs to acquire or release resources (database connections, API clients, etc.), override the context manager methods. Paperless-ngx will always use plugins as context managers, ensuring resources can be released even in the event of errors.
+
+#### 4. Registering Your Plugin
+
+Register your plugin using a setuptools entry point in your package's `pyproject.toml`:
+
+```toml
+[project.entry-points."paperless_ngx.date_parsers"]
+my_parser = "my_package.parsers:MyDateParserPlugin"
+```
+
+The entry point name (e.g., `"my_parser"`) is used for sorting when multiple plugins are found. Paperless-ngx will use the first plugin alphabetically by name if multiple plugins are discovered.
+
+### Plugin Discovery
+
+Paperless-ngx automatically discovers and loads date parser plugins at runtime. The discovery process:
+
+1. Queries the `paperless_ngx.date_parsers` entry point group
+2. Validates that each plugin is a subclass of `DateParserPluginBase`
+3. Sorts valid plugins alphabetically by entry point name
+4. Uses the first valid plugin, or falls back to the default `RegexDateParserPlugin` if none are found
+
+If multiple plugins are installed, a warning is logged indicating which plugin was selected.
+
+### Example: Simple Date Parser
+
+Here's a minimal example that only looks for ISO 8601 dates:
+
+```python
+import datetime
+import re
+from collections.abc import Iterator
+
+from documents.plugins.date_parsing.base import DateParserPluginBase
+
+
+class ISODateParserPlugin(DateParserPluginBase):
+    """
+    Parser that only matches ISO 8601 formatted dates (YYYY-MM-DD).
+    """
+
+    ISO_REGEX = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
+
+    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
+        # Combine filename and content for searching
+        text = f"{filename} {content}"
+
+        for match in self.ISO_REGEX.finditer(text):
+            date_string = match.group(1)
+            # Use helper method to parse with configured timezone
+            date = self._parse_string(date_string, "YMD")
+            # Use helper method to validate the date
+            filtered_date = self._filter_date(date)
+            if filtered_date is not None:
+                yield filtered_date
+```
index b9bbca96b0a2a53b049673bb0575971624f694dc..1faa435f1bfb6c21804f26e372005c17bfd736bf 100644 (file)
@@ -306,6 +306,7 @@ markers = [
   "gotenberg: Tests requiring Gotenberg service",
   "tika: Tests requiring Tika service",
   "greenmail: Tests requiring Greenmail service",
+  "date_parsing: Tests which cover date parsing from content or filename",
 ]
 
 [tool.pytest_env]
@@ -332,6 +333,10 @@ exclude_also = [
 
 [tool.mypy]
 mypy_path = "src"
+files = [
+  "src/documents/plugins/date_parsing",
+  "src/documents/tests/date_parsing",
+]
 plugins = [
   "mypy_django_plugin.main",
   "mypy_drf_plugin.main",
@@ -343,5 +348,28 @@ disallow_untyped_defs = true
 warn_redundant_casts = true
 warn_unused_ignores = true
 
+# This prevents errors from imports, but allows type-checking logic to work
+follow_imports = "silent"
+
+[[tool.mypy.overrides]]
+module = [
+  "documents.*",
+  "paperless.*",
+  "paperless_ai.*",
+  "paperless_mail.*",
+  "paperless_tesseract.*",
+  "paperless_remote.*",
+  "paperless_text.*",
+  "paperless_tika.*",
+]
+ignore_errors = true
+
+[[tool.mypy.overrides]]
+module = [
+  "documents.plugins.date_parsing.*",
+  "documents.tests.date_parsing.*",
+]
+ignore_errors = false
+
 [tool.django-stubs]
 django_settings_module = "paperless.settings"
index d9a149ed518bf1cd35950371c8a4e106d3f9f354..10a95201527a14f850710a84221e9fe7b3f67b7c 100644 (file)
@@ -33,12 +33,12 @@ from documents.models import WorkflowTrigger
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import parse_date
 from documents.permissions import set_permissions_for_object
 from documents.plugins.base import AlwaysRunPluginMixin
 from documents.plugins.base import ConsumeTaskPlugin
 from documents.plugins.base import NoCleanupPluginMixin
 from documents.plugins.base import NoSetupPluginMixin
+from documents.plugins.date_parsing import get_date_parser
 from documents.plugins.helpers import ProgressManager
 from documents.plugins.helpers import ProgressStatusOptions
 from documents.signals import document_consumption_finished
@@ -432,7 +432,8 @@ class ConsumerPlugin(
                     ProgressStatusOptions.WORKING,
                     ConsumerStatusShortMessage.PARSE_DATE,
                 )
-                date = parse_date(self.filename, text)
+                with get_date_parser() as date_parser:
+                    date = next(date_parser.parse(self.filename, text), None)
             archive_path = document_parser.get_archive_path()
             page_count = document_parser.get_page_count(self.working_copy, mime_type)
 
index 211fb61fe98a8a28647d111d4f2baf14d2f197d3..b59e7d6b798349dcd63b1593751a8261722fa6ef 100644 (file)
@@ -9,22 +9,17 @@ import subprocess
 import tempfile
 from functools import lru_cache
 from pathlib import Path
-from re import Match
 from typing import TYPE_CHECKING
 
 from django.conf import settings
-from django.utils import timezone
 
 from documents.loggers import LoggingMixin
 from documents.signals import document_consumer_declaration
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
-from paperless.config import OcrConfig
-from paperless.utils import ocr_to_dateparser_languages
 
 if TYPE_CHECKING:
     import datetime
-    from collections.abc import Iterator
 
 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
@@ -259,75 +254,6 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -
     return out_path
 
 
-def parse_date(filename, text) -> datetime.datetime | None:
-    return next(parse_date_generator(filename, text), None)
-
-
-def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
-    """
-    Returns the date of the document.
-    """
-
-    def __parser(ds: str, date_order: str) -> datetime.datetime:
-        """
-        Call dateparser.parse with a particular date ordering
-        """
-        import dateparser
-
-        ocr_config = OcrConfig()
-        languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
-            ocr_config.language,
-        )
-
-        return dateparser.parse(
-            ds,
-            settings={
-                "DATE_ORDER": date_order,
-                "PREFER_DAY_OF_MONTH": "first",
-                "RETURN_AS_TIMEZONE_AWARE": True,
-                "TIMEZONE": settings.TIME_ZONE,
-            },
-            locales=languages,
-        )
-
-    def __filter(date: datetime.datetime) -> datetime.datetime | None:
-        if (
-            date is not None
-            and date.year > 1900
-            and date <= timezone.now()
-            and date.date() not in settings.IGNORE_DATES
-        ):
-            return date
-        return None
-
-    def __process_match(
-        match: Match[str],
-        date_order: str,
-    ) -> datetime.datetime | None:
-        date_string = match.group(0)
-
-        try:
-            date = __parser(date_string, date_order)
-        except Exception:
-            # Skip all matches that do not parse to a proper date
-            date = None
-
-        return __filter(date)
-
-    def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
-        for m in re.finditer(DATE_REGEX, content):
-            date = __process_match(m, date_order)
-            if date is not None:
-                yield date
-
-    # if filename date parsing is enabled, search there first:
-    if settings.FILENAME_DATE_ORDER:
-        yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
-
-    # Iterate through all regex matches in text and try to parse the date
-    yield from __process_content(text, settings.DATE_ORDER)
-
-
 class ParseError(Exception):
     pass
 
diff --git a/src/documents/plugins/date_parsing/__init__.py b/src/documents/plugins/date_parsing/__init__.py
new file mode 100644 (file)
index 0000000..2eec1e2
--- /dev/null
@@ -0,0 +1,101 @@
+import logging
+from functools import lru_cache
+from importlib.metadata import EntryPoint
+from importlib.metadata import entry_points
+from typing import Final
+
+from django.conf import settings
+from django.utils import timezone
+
+from documents.plugins.date_parsing.base import DateParserConfig
+from documents.plugins.date_parsing.base import DateParserPluginBase
+from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
+from paperless.config import OcrConfig
+from paperless.utils import ocr_to_dateparser_languages
+
+logger = logging.getLogger(__name__)
+
+DATE_PARSER_ENTRY_POINT_GROUP: Final = "paperless_ngx.date_parsers"
+
+
+@lru_cache(maxsize=1)
+def _discover_parser_class() -> type[DateParserPluginBase]:
+    """
+    Discovers the date parser plugin class to use.
+
+    Queries the DATE_PARSER_ENTRY_POINT_GROUP entry point group, loads each
+    entry point exactly once and keeps those whose loaded object is a
+    subclass of DateParserPluginBase.
+
+    - If one or more plugins are found, sorts them by entry point name and
+      returns the first. A warning is logged if more than one was valid.
+    - If no plugins are found, returns the default RegexDateParserPlugin.
+
+    The result is memoized via lru_cache, so discovery only runs again after
+    the cache has been cleared.
+
+    Returns:
+        The selected parser class (not an instance).
+    """
+
+    eps: tuple[EntryPoint, ...]
+    try:
+        eps = entry_points(group=DATE_PARSER_ENTRY_POINT_GROUP)
+    except Exception as e:
+        # A failing entry point query falls back to the default parser
+        logger.warning(f"Could not query entry points for date parsers: {e}")
+        eps = ()
+
+    # Keep the loaded class next to its name, so nothing needs loading twice
+    valid_plugins: list[tuple[str, type[DateParserPluginBase]]] = []
+    for ep in eps:
+        try:
+            plugin_class = ep.load()
+        except Exception as e:
+            logger.error(f"Unable to load date parser plugin {ep.name}: {e}")
+            continue
+
+        # issubclass() raises TypeError for non-class objects, so check first
+        if isinstance(plugin_class, type) and issubclass(
+            plugin_class,
+            DateParserPluginBase,
+        ):
+            valid_plugins.append((ep.name, plugin_class))
+        else:
+            logger.warning(f"Plugin {ep.name} does not subclass DateParser.")
+
+    if not valid_plugins:
+        return RegexDateParserPlugin
+
+    # Sort on the name only; the plugin classes themselves are not orderable
+    valid_plugins.sort(key=lambda plugin: plugin[0])
+
+    if len(valid_plugins) > 1:
+        logger.warning(
+            f"Multiple date parsers found: "
+            f"{[name for name, _ in valid_plugins]}. "
+            f"Using the first one by name: '{valid_plugins[0][0]}'.",
+        )
+
+    return valid_plugins[0][1]
+
+
+def get_date_parser() -> DateParserPluginBase:
+    """
+    Factory function to get an initialized date parser instance.
+
+    This function is responsible for:
+    1. Discovering the correct parser class (plugin or default).
+    2. Loading configuration from Django settings.
+    3. Instantiating the parser with the configuration.
+
+    A new DateParserConfig and a new parser instance are built on every
+    call; only the class discovery is cached.
+
+    Returns:
+        An instance of the discovered parser class, constructed with
+        ``config=<DateParserConfig>``.
+    """
+    # 1. Discover the class (this is cached)
+    parser_class = _discover_parser_class()
+
+    # 2. Load configuration from settings
+    # TODO: Get the language from the settings and/or configuration object
+    ocr_config = OcrConfig()
+    # An explicit DATE_PARSER_LANGUAGES setting wins; otherwise the languages
+    # are derived from the OCR language
+    languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
+        ocr_config.language,
+    )
+
+    # Setting values are passed through unchanged (no copying or conversion);
+    # reference_time is the time of this call
+    config = DateParserConfig(
+        languages=languages,
+        timezone_str=settings.TIME_ZONE,
+        ignore_dates=settings.IGNORE_DATES,
+        reference_time=timezone.now(),
+        filename_date_order=settings.FILENAME_DATE_ORDER,
+        content_date_order=settings.DATE_ORDER,
+    )
+
+    # 3. Instantiate the discovered class with the config
+    return parser_class(config=config)
+
+
+__all__ = [
+    "DateParserConfig",
+    "DateParserPluginBase",
+    "RegexDateParserPlugin",
+    "get_date_parser",
+]
diff --git a/src/documents/plugins/date_parsing/base.py b/src/documents/plugins/date_parsing/base.py
new file mode 100644 (file)
index 0000000..c6df1a7
--- /dev/null
@@ -0,0 +1,124 @@
+import datetime
+import logging
+from abc import ABC
+from abc import abstractmethod
+from collections.abc import Iterator
+from dataclasses import dataclass
+from types import TracebackType
+
+try:
+    from typing import Self
+except ImportError:
+    from typing_extensions import Self
+
+import dateparser
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True, slots=True)
+class DateParserConfig:
+    """
+    Configuration for a DateParser instance.
+
+    This object is created by the factory and passed to the
+    parser's constructor, decoupling the parser from settings.
+
+    The dataclass is frozen, so fields cannot be reassigned after creation.
+    """
+
+    # Passed to dateparser as its `locales` argument
+    languages: list[str]
+    # Passed to dateparser as its "TIMEZONE" setting
+    timezone_str: str
+    # Calendar dates that must never be returned as a result
+    ignore_dates: set[datetime.date]
+
+    # A "now" timestamp for filtering future dates.
+    # Passed in by the factory.
+    reference_time: datetime.datetime
+
+    # Settings for the default RegexDateParserPlugin
+    # Other plugins should use or consider these, but it is not required
+    # A falsy filename_date_order makes the default parser skip the filename
+    filename_date_order: str | None
+    content_date_order: str
+
+
+class DateParserPluginBase(ABC):
+    """
+    Abstract base class for date parsing strategies.
+
+    Instances are configured via a DateParserConfig object and can be used
+    as context managers. Subclasses must implement `parse`; `_parse_string`
+    and `_filter_date` are shared helpers they may use to do so.
+    """
+
+    def __init__(self, config: DateParserConfig):
+        """
+        Initializes the parser with its configuration.
+
+        Args:
+            config: The configuration, stored as `self.config`.
+        """
+        self.config = config
+
+    def __enter__(self) -> Self:
+        """
+        Enter the runtime context related to this object.
+
+        Subclasses can override this to acquire resources (connections, handles).
+
+        Returns:
+            This parser instance.
+        """
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """
+        Exit the runtime context related to this object.
+
+        Subclasses can override this to release resources.
+
+        Args:
+            exc_type: Type of the exception raised in the `with` body, if any.
+            exc_val: The exception instance, if any.
+            exc_tb: The traceback of the exception, if any.
+        """
+        # Default implementation does nothing.
+        # Returning None implies exceptions are propagated.
+
+    def _parse_string(
+        self,
+        date_string: str,
+        date_order: str,
+    ) -> datetime.datetime | None:
+        """
+        Helper method to parse a single date string using dateparser.
+
+        Uses configuration from `self.config`.
+
+        Args:
+            date_string: The text to interpret as a date.
+            date_order: Value for dateparser's "DATE_ORDER" setting, e.g. "DMY".
+
+        Returns:
+            Whatever `dateparser.parse` returns for the string, or None if
+            dateparser raised an exception (the error is logged).
+        """
+        try:
+            return dateparser.parse(
+                date_string,
+                settings={
+                    "DATE_ORDER": date_order,
+                    "PREFER_DAY_OF_MONTH": "first",
+                    "RETURN_AS_TIMEZONE_AWARE": True,
+                    "TIMEZONE": self.config.timezone_str,
+                },
+                locales=self.config.languages,
+            )
+        except Exception as e:
+            # A single unparsable string must not abort the whole document
+            logger.error(f"Error while parsing date string '{date_string}': {e}")
+            return None
+
+    def _filter_date(
+        self,
+        date: datetime.datetime | None,
+    ) -> datetime.datetime | None:
+        """
+        Helper method to validate a parsed datetime object.
+
+        Uses configuration from `self.config`.
+
+        Args:
+            date: A parsed datetime, or None if parsing failed.
+
+        Returns:
+            `date` unchanged if it is not None, its year is later than 1900
+            (the year 1900 itself is rejected), it is not after
+            `self.config.reference_time` and its calendar date is not in
+            `self.config.ignore_dates`. Otherwise None.
+        """
+        if (
+            date is not None
+            and date.year > 1900
+            and date <= self.config.reference_time
+            and date.date() not in self.config.ignore_dates
+        ):
+            return date
+        return None
+
+    @abstractmethod
+    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
+        """
+        Parses a document's filename and content, yielding valid datetime objects.
+
+        Args:
+            filename: The filename of the document.
+            content: The text content of the document.
+        """
diff --git a/src/documents/plugins/date_parsing/regex_parser.py b/src/documents/plugins/date_parsing/regex_parser.py
new file mode 100644 (file)
index 0000000..2df8f92
--- /dev/null
@@ -0,0 +1,65 @@
+import datetime
+import re
+from collections.abc import Iterator
+from re import Match
+
+from documents.plugins.date_parsing.base import DateParserPluginBase
+
+
+class RegexDateParserPlugin(DateParserPluginBase):
+    """
+    The default date parser, using a series of regular expressions.
+
+    It is configured entirely by the DateParserConfig object
+    passed to its constructor.
+    """
+
+    # One alternative per supported textual date layout; compiled once and
+    # matched case-insensitively. Only the full match (group 0) is used.
+    # NOTE(review): the leading `(?!=([_-]))` is a negative lookahead for a
+    # literal "=" followed by "_" or "-". A lookbehind `(?<=[_-])` may have
+    # been intended; confirm before changing, as it alters what matches.
+    DATE_REGEX = re.compile(
+        r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
+        r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
+        r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
+        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
+        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
+        r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
+        r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
+        re.IGNORECASE,
+    )
+
+    def _process_match(
+        self,
+        match: Match[str],
+        date_order: str,
+    ) -> datetime.datetime | None:
+        """
+        Processes a single regex match using the base class helpers.
+
+        Args:
+            match: A match of DATE_REGEX; its full text is parsed.
+            date_order: Date order handed to `_parse_string`.
+
+        Returns:
+            The parsed datetime if it passes `_filter_date`, otherwise None.
+        """
+        date_string = match.group(0)
+        date = self._parse_string(date_string, date_order)
+        return self._filter_date(date)
+
+    def _process_content(
+        self,
+        content: str,
+        date_order: str,
+    ) -> Iterator[datetime.datetime]:
+        """
+        Finds all regex matches in content and yields valid dates.
+
+        Args:
+            content: The text to search.
+            date_order: Date order used for every match in this text.
+
+        Yields:
+            Each accepted date, in the order the matches occur in `content`.
+        """
+        for m in re.finditer(self.DATE_REGEX, content):
+            date = self._process_match(m, date_order)
+            if date is not None:
+                yield date
+
+    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
+        """
+        Implementation of the abstract parse method.
+
+        Reads its configuration from `self.config`.
+
+        Dates found in the filename are yielded first, but only when
+        `filename_date_order` is set; dates from the content follow.
+        """
+        if self.config.filename_date_order:
+            yield from self._process_content(
+                filename,
+                self.config.filename_date_order,
+            )
+
+        yield from self._process_content(content, self.config.content_date_order)
diff --git a/src/documents/tests/date_parsing/__init__.py b/src/documents/tests/date_parsing/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/documents/tests/date_parsing/conftest.py b/src/documents/tests/date_parsing/conftest.py
new file mode 100644 (file)
index 0000000..ea9e244
--- /dev/null
@@ -0,0 +1,82 @@
+import datetime
+from collections.abc import Generator
+from typing import Any
+
+import pytest
+import pytest_django
+
+from documents.plugins.date_parsing import _discover_parser_class
+from documents.plugins.date_parsing.base import DateParserConfig
+from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
+
+
+@pytest.fixture
+def base_config() -> DateParserConfig:
+    """
+    Basic configuration for date parser testing.
+
+    English only, UTC, no ignored dates and a fixed reference time of
+    2024-01-15 12:00:00 UTC, so "future date" checks are deterministic.
+    """
+    return DateParserConfig(
+        languages=["en"],
+        timezone_str="UTC",
+        ignore_dates=set(),
+        reference_time=datetime.datetime(
+            2024,
+            1,
+            15,
+            12,
+            0,
+            0,
+            tzinfo=datetime.timezone.utc,
+        ),
+        filename_date_order="YMD",
+        content_date_order="DMY",
+    )
+
+
+@pytest.fixture
+def config_with_ignore_dates() -> DateParserConfig:
+    """
+    Configuration with dates to ignore.
+
+    Ignores 2024-01-01 and 2024-12-25, uses English and German, the
+    America/New_York timezone and the same fixed reference time as
+    `base_config` (2024-01-15 12:00:00 UTC).
+    """
+    return DateParserConfig(
+        languages=["en", "de"],
+        timezone_str="America/New_York",
+        ignore_dates={datetime.date(2024, 1, 1), datetime.date(2024, 12, 25)},
+        reference_time=datetime.datetime(
+            2024,
+            1,
+            15,
+            12,
+            0,
+            0,
+            tzinfo=datetime.timezone.utc,
+        ),
+        filename_date_order="DMY",
+        content_date_order="MDY",
+    )
+
+
+@pytest.fixture
+def regex_parser(base_config: DateParserConfig) -> RegexDateParserPlugin:
+    """Instance of RegexDateParserPlugin built from the `base_config` fixture."""
+    return RegexDateParserPlugin(base_config)
+
+
+@pytest.fixture
+def clear_lru_cache() -> Generator[None, None, None]:
+    """
+    Ensure the LRU cache for _discover_parser_class is cleared
+    before and after any test that depends on it.
+    """
+    # Clear before the test so a previously cached class cannot leak in
+    _discover_parser_class.cache_clear()
+    yield
+    # Clear afterwards so this test's result cannot leak out
+    _discover_parser_class.cache_clear()
+
+
+@pytest.fixture
+def mock_date_parser_settings(settings: pytest_django.fixtures.SettingsWrapper) -> Any:
+    """
+    Override Django settings for the duration of date parser tests.
+
+    Sets the settings that `get_date_parser` reads and returns the settings
+    wrapper so a test can adjust them further.
+    """
+    settings.DATE_PARSER_LANGUAGES = ["en", "de"]
+    settings.TIME_ZONE = "UTC"
+    # Note: a list here, not a set
+    settings.IGNORE_DATES = [datetime.date(1900, 1, 1)]
+    settings.FILENAME_DATE_ORDER = "YMD"
+    settings.DATE_ORDER = "DMY"
+    return settings
diff --git a/src/documents/tests/date_parsing/test_date_parser_plugin_loading.py b/src/documents/tests/date_parsing/test_date_parser_plugin_loading.py
new file mode 100644 (file)
index 0000000..5d870fe
--- /dev/null
@@ -0,0 +1,229 @@
+import datetime
+import logging
+from collections.abc import Iterator
+from importlib.metadata import EntryPoint
+
+import pytest
+import pytest_mock
+from django.utils import timezone
+
+from documents.plugins.date_parsing import DATE_PARSER_ENTRY_POINT_GROUP
+from documents.plugins.date_parsing import _discover_parser_class
+from documents.plugins.date_parsing import get_date_parser
+from documents.plugins.date_parsing.base import DateParserConfig
+from documents.plugins.date_parsing.base import DateParserPluginBase
+from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
+
+
+class AlphaParser(DateParserPluginBase):
+    """Minimal valid plugin used as a stand-in for a discovered parser."""
+
+    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
+        """Ignore both inputs and yield the current time once."""
+        yield timezone.now()
+
+
+class BetaParser(DateParserPluginBase):
+    """Second minimal valid plugin, used to test selection among several."""
+
+    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
+        """Ignore both inputs and yield the current time once."""
+        yield timezone.now()
+
+
+@pytest.mark.date_parsing
+@pytest.mark.usefixtures("clear_lru_cache")
+class TestDiscoverParserClass:
+    """
+    Tests for the _discover_parser_class() function.
+
+    Every test patches `entry_points` in the module under test, and the
+    `clear_lru_cache` fixture resets the discovery cache around each test.
+    """
+
+    def test_returns_default_when_no_plugins_found(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """With no entry points registered, the default parser is returned."""
+        mocker.patch(
+            "documents.plugins.date_parsing.entry_points",
+            return_value=(),
+        )
+        result = _discover_parser_class()
+        assert result is RegexDateParserPlugin
+
+    def test_returns_default_when_entrypoint_query_fails(
+        self,
+        mocker: pytest_mock.MockerFixture,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """If querying entry points raises, the default is used and a warning is logged."""
+        mocker.patch(
+            "documents.plugins.date_parsing.entry_points",
+            side_effect=RuntimeError("boom"),
+        )
+        result = _discover_parser_class()
+        assert result is RegexDateParserPlugin
+        assert "Could not query entry points" in caplog.text
+
+    def test_filters_out_invalid_plugins(
+        self,
+        mocker: pytest_mock.MockerFixture,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """A loaded class that is not a DateParserPluginBase subclass is rejected."""
+        fake_ep = mocker.MagicMock(spec=EntryPoint)
+        fake_ep.name = "bad_plugin"
+        fake_ep.load.return_value = object  # not subclass of DateParser
+
+        mocker.patch(
+            "documents.plugins.date_parsing.entry_points",
+            return_value=(fake_ep,),
+        )
+
+        result = _discover_parser_class()
+        assert result is RegexDateParserPlugin
+        assert "does not subclass DateParser" in caplog.text
+
+    def test_skips_plugins_that_fail_to_load(
+        self,
+        mocker: pytest_mock.MockerFixture,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """An entry point whose load() raises is skipped and the error is logged."""
+        fake_ep = mocker.MagicMock(spec=EntryPoint)
+        fake_ep.name = "failing_plugin"
+        fake_ep.load.side_effect = ImportError("cannot import")
+
+        mocker.patch(
+            "documents.plugins.date_parsing.entry_points",
+            return_value=(fake_ep,),
+        )
+
+        result = _discover_parser_class()
+        assert result is RegexDateParserPlugin
+        assert "Unable to load date parser plugin failing_plugin" in caplog.text
+
+    def test_returns_single_valid_plugin_without_warning(
+        self,
+        mocker: pytest_mock.MockerFixture,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """If exactly one valid plugin is discovered, it should be returned without logging a warning."""
+
+        ep = mocker.MagicMock(spec=EntryPoint)
+        ep.name = "alpha"
+        ep.load.return_value = AlphaParser
+
+        mock_entry_points = mocker.patch(
+            "documents.plugins.date_parsing.entry_points",
+            return_value=(ep,),
+        )
+
+        with caplog.at_level(
+            logging.WARNING,
+            logger="documents.plugins.date_parsing",
+        ):
+            result = _discover_parser_class()
+
+        # It should have called entry_points with the correct group
+        mock_entry_points.assert_called_once_with(group=DATE_PARSER_ENTRY_POINT_GROUP)
+
+        # The discovered class should be exactly our AlphaParser
+        assert result is AlphaParser
+
+        # No warnings should have been logged
+        assert not any(
+            "Multiple date parsers found" in record.message for record in caplog.records
+        ), "Unexpected warning logged when only one plugin was found"
+
+    def test_returns_first_valid_plugin_by_name(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """Selection follows the entry point name, not the discovery order."""
+        ep_a = mocker.MagicMock(spec=EntryPoint)
+        ep_a.name = "alpha"
+        ep_a.load.return_value = AlphaParser
+
+        ep_b = mocker.MagicMock(spec=EntryPoint)
+        ep_b.name = "beta"
+        ep_b.load.return_value = BetaParser
+
+        # "beta" is deliberately listed first to prove sorting happens
+        mocker.patch(
+            "documents.plugins.date_parsing.entry_points",
+            return_value=(ep_b, ep_a),
+        )
+
+        result = _discover_parser_class()
+        assert result is AlphaParser
+
+    def test_logs_warning_if_multiple_plugins_found(
+        self,
+        mocker: pytest_mock.MockerFixture,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """With two valid plugins, the first by name wins and a warning is logged."""
+        ep1 = mocker.MagicMock(spec=EntryPoint)
+        ep1.name = "a"
+        ep1.load.return_value = AlphaParser
+
+        ep2 = mocker.MagicMock(spec=EntryPoint)
+        ep2.name = "b"
+        ep2.load.return_value = BetaParser
+
+        mocker.patch(
+            "documents.plugins.date_parsing.entry_points",
+            return_value=(ep1, ep2),
+        )
+
+        with caplog.at_level(
+            logging.WARNING,
+            logger="documents.plugins.date_parsing",
+        ):
+            result = _discover_parser_class()
+
+        # Should select alphabetically first plugin ("a")
+        assert result is AlphaParser
+
+        # Should log a warning mentioning multiple parsers
+        assert any(
+            "Multiple date parsers found" in record.message for record in caplog.records
+        ), "Expected a warning about multiple date parsers"
+
+    def test_cache_behavior_only_runs_once(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """Repeated calls are served from the cache and query entry points once."""
+        mock_entry_points = mocker.patch(
+            "documents.plugins.date_parsing.entry_points",
+            return_value=(),
+        )
+
+        # First call populates cache
+        _discover_parser_class()
+        # Second call should not re-invoke entry_points
+        _discover_parser_class()
+        mock_entry_points.assert_called_once()
+
+
+@pytest.mark.django_db
+@pytest.mark.date_parsing
+@pytest.mark.usefixtures("mock_date_parser_settings")
+class TestGetDateParser:
+    """
+    Tests for the get_date_parser() factory function.
+
+    Class discovery is patched out in each test, so only instantiation and
+    configuration loading are exercised here.
+    """
+
+    def test_returns_instance_of_discovered_class(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """The factory instantiates the discovered class with a config built from settings."""
+        mocker.patch(
+            "documents.plugins.date_parsing._discover_parser_class",
+            return_value=AlphaParser,
+        )
+        parser = get_date_parser()
+        assert isinstance(parser, AlphaParser)
+        assert isinstance(parser.config, DateParserConfig)
+        assert parser.config.languages == ["en", "de"]
+        assert parser.config.timezone_str == "UTC"
+        # The setting is passed through as-is, so this compares against a list
+        assert parser.config.ignore_dates == [datetime.date(1900, 1, 1)]
+        assert parser.config.filename_date_order == "YMD"
+        assert parser.config.content_date_order == "DMY"
+        # Check reference_time near now
+        delta = abs((parser.config.reference_time - timezone.now()).total_seconds())
+        assert delta < 2
+
+    def test_uses_default_regex_parser_when_no_plugins(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """When discovery yields the default class, the factory returns an instance of it."""
+        mocker.patch(
+            "documents.plugins.date_parsing._discover_parser_class",
+            return_value=RegexDateParserPlugin,
+        )
+        parser = get_date_parser()
+        assert isinstance(parser, RegexDateParserPlugin)
diff --git a/src/documents/tests/date_parsing/test_date_parsing.py b/src/documents/tests/date_parsing/test_date_parsing.py
new file mode 100644 (file)
index 0000000..a587b32
--- /dev/null
@@ -0,0 +1,433 @@
+import datetime
+import logging
+from typing import Any
+
+import pytest
+import pytest_mock
+
+from documents.plugins.date_parsing.base import DateParserConfig
+from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
+
+
+@pytest.mark.date_parsing
+class TestParseString:
+    """Tests for DateParser._parse_string method via RegexDateParser."""
+
+    @pytest.mark.parametrize(
+        ("date_string", "date_order", "expected_year"),
+        [
+            pytest.param("15/01/2024", "DMY", 2024, id="dmy_slash"),
+            pytest.param("01/15/2024", "MDY", 2024, id="mdy_slash"),
+            pytest.param("2024/01/15", "YMD", 2024, id="ymd_slash"),
+            pytest.param("January 15, 2024", "DMY", 2024, id="month_name_comma"),
+            pytest.param("15 Jan 2024", "DMY", 2024, id="day_abbr_month_year"),
+            pytest.param("15.01.2024", "DMY", 2024, id="dmy_dot"),
+            pytest.param("2024-01-15", "YMD", 2024, id="ymd_dash"),
+        ],
+    )
+    def test_parse_string_valid_formats(
+        self,
+        regex_parser: RegexDateParserPlugin,
+        date_string: str,
+        date_order: str,
+        expected_year: int,
+    ) -> None:
+        """Should correctly parse various valid date formats."""
+        result = regex_parser._parse_string(date_string, date_order)
+
+        assert result is not None
+        assert result.year == expected_year
+
+    @pytest.mark.parametrize(
+        "invalid_string",
+        [
+            pytest.param("not a date", id="plain_text"),
+            pytest.param("32/13/2024", id="invalid_day_month"),
+            pytest.param("", id="empty_string"),
+            pytest.param("abc123xyz", id="alphanumeric_gibberish"),
+            pytest.param("99/99/9999", id="out_of_range"),
+        ],
+    )
+    def test_parse_string_invalid_input(
+        self,
+        regex_parser: RegexDateParserPlugin,
+        invalid_string: str,
+    ) -> None:
+        """Should return None for invalid date strings."""
+        result = regex_parser._parse_string(invalid_string, "DMY")
+
+        assert result is None
+
+    def test_parse_string_handles_exceptions(
+        self,
+        caplog: pytest.LogCaptureFixture,
+        mocker: pytest_mock.MockerFixture,
+        regex_parser: RegexDateParserPlugin,
+    ) -> None:
+        """Should handle and log exceptions from dateparser gracefully."""
+        with caplog.at_level(
+            logging.ERROR,
+            logger="documents.plugins.date_parsing.base",
+        ):
+            # We still need to mock dateparser.parse to force the exception
+            mocker.patch(
+                "documents.plugins.date_parsing.base.dateparser.parse",
+                side_effect=ValueError(
+                    "Parsing error: 01/01/2024",
+                ),
+            )
+
+            # 1. Execute the function under test
+            result = regex_parser._parse_string("01/01/2024", "DMY")
+
+            assert result is None
+
+            # Check if an error was logged
+            assert len(caplog.records) == 1
+            assert caplog.records[0].levelname == "ERROR"
+
+            # Check if the specific error message is present
+            assert "Error while parsing date string" in caplog.text
+            # Optional: Check for the exact exception message if it's included in the log
+            assert "Parsing error: 01/01/2024" in caplog.text
+
+
+@pytest.mark.date_parsing
+class TestFilterDate:
+    """Tests for DateParser._filter_date method via RegexDateParser."""
+
+    @pytest.mark.parametrize(
+        ("date", "expected_output"),
+        [
+            # Valid Dates
+            pytest.param(
+                datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc),
+                datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc),
+                id="valid_past_date",
+            ),
+            pytest.param(
+                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc),
+                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc),
+                id="exactly_at_reference",
+            ),
+            pytest.param(
+                datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc),
+                datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc),
+                id="year_1901_valid",
+            ),
+            # Date is > reference_time
+            pytest.param(
+                datetime.datetime(2024, 1, 16, tzinfo=datetime.timezone.utc),
+                None,
+                id="future_date_day_after",
+            ),
+            # date.date() in ignore_dates
+            pytest.param(
+                datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
+                None,
+                id="ignored_date_midnight_jan1",
+            ),
+            pytest.param(
+                datetime.datetime(2024, 1, 1, 10, 30, 0, tzinfo=datetime.timezone.utc),
+                None,
+                id="ignored_date_midday_jan1",
+            ),
+            pytest.param(
+                datetime.datetime(2024, 12, 25, 15, 0, 0, tzinfo=datetime.timezone.utc),
+                None,
+                id="ignored_date_dec25_future",
+            ),
+            # date.year <= 1900
+            pytest.param(
+                datetime.datetime(1899, 12, 31, tzinfo=datetime.timezone.utc),
+                None,
+                id="year_1899",
+            ),
+            pytest.param(
+                datetime.datetime(1900, 1, 1, tzinfo=datetime.timezone.utc),
+                None,
+                id="year_1900_boundary",
+            ),
+            # date is None
+            pytest.param(None, None, id="none_input"),
+        ],
+    )
+    def test_filter_date_validation_rules(
+        self,
+        config_with_ignore_dates: DateParserConfig,
+        date: datetime.datetime | None,
+        expected_output: datetime.datetime | None,
+    ) -> None:
+        """Should correctly validate dates against various rules."""
+        parser = RegexDateParserPlugin(config_with_ignore_dates)
+        result = parser._filter_date(date)
+        assert result == expected_output
+
+    def test_filter_date_respects_ignore_dates(
+        self,
+        config_with_ignore_dates: DateParserConfig,
+    ) -> None:
+        """Should filter out dates in the ignore_dates set."""
+        parser = RegexDateParserPlugin(config_with_ignore_dates)
+
+        ignored_date = datetime.datetime(
+            2024,
+            1,
+            1,
+            12,
+            0,
+            tzinfo=datetime.timezone.utc,
+        )
+        another_ignored = datetime.datetime(
+            2024,
+            12,
+            25,
+            15,
+            30,
+            tzinfo=datetime.timezone.utc,
+        )
+        allowed_date = datetime.datetime(
+            2024,
+            1,
+            2,
+            12,
+            0,
+            tzinfo=datetime.timezone.utc,
+        )
+
+        assert parser._filter_date(ignored_date) is None
+        assert parser._filter_date(another_ignored) is None
+        assert parser._filter_date(allowed_date) == allowed_date
+
+    def test_filter_date_timezone_aware(
+        self,
+        regex_parser: RegexDateParserPlugin,
+    ) -> None:
+        """Should work with timezone-aware datetimes."""
+        date_utc = datetime.datetime(2024, 1, 10, 12, 0, tzinfo=datetime.timezone.utc)
+
+        result = regex_parser._filter_date(date_utc)
+
+        assert result is not None
+        assert result.tzinfo is not None
+
+
+@pytest.mark.date_parsing
+class TestRegexDateParser:
+    @pytest.mark.parametrize(
+        ("filename", "content", "expected"),
+        [
+            pytest.param(
+                "report-2023-12-25.txt",
+                "Event recorded on 25/12/2022.",
+                [
+                    datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc),
+                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
+                ],
+                id="filename-y-m-d_and_content-d-m-y",
+            ),
+            pytest.param(
+                "img_2023.01.02.jpg",
+                "Taken on 01/02/2023",
+                [
+                    datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc),
+                    datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc),
+                ],
+                id="ambiguous-dates-respect-orders",
+            ),
+            pytest.param(
+                "notes.txt",
+                "bad date 99/99/9999 and 25/12/2022",
+                [
+                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
+                ],
+                id="parse-exception-skips-bad-and-yields-good",
+            ),
+        ],
+    )
+    def test_parse_returns_expected_dates(
+        self,
+        base_config: DateParserConfig,
+        mocker: pytest_mock.MockerFixture,
+        filename: str,
+        content: str,
+        expected: list[datetime.datetime],
+    ) -> None:
+        """
+        High-level tests that exercise RegexDateParser.parse only.
+        dateparser.parse is mocked so tests are deterministic.
+        """
+        parser = RegexDateParserPlugin(base_config)
+
+        # Patch the dateparser.parse
+        target = "documents.plugins.date_parsing.base.dateparser.parse"
+
+        def fake_parse(
+            date_string: str,
+            settings: dict[str, Any] | None = None,
+            locales: None = None,
+        ) -> datetime.datetime | None:
+            date_order = settings.get("DATE_ORDER") if settings else None
+
+            # Filename-style YYYY-MM-DD / YYYY.MM.DD
+            if (
+                "2023-12-25" in date_string
+                or "2023.12.25" in date_string
+                or "2023-12-25" in date_string
+            ):
+                return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)
+
+            # content DMY 25/12/2022
+            if "25/12/2022" in date_string or "25-12-2022" in date_string:
+                return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)
+
+            # filename YMD 2023.01.02
+            if "2023.01.02" in date_string or "2023-01-02" in date_string:
+                return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)
+
+            # ambiguous 01/02/2023 -> respect DATE_ORDER setting
+            if "01/02/2023" in date_string:
+                if date_order == "DMY":
+                    return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)
+                if date_order == "YMD":
+                    return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)
+                # fallback
+                return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)
+
+            # simulate parse failure for malformed input
+            if "99/99/9999" in date_string or "bad date" in date_string:
+                raise Exception("parse failed for malformed date")
+
+            return None
+
+        mocker.patch(target, side_effect=fake_parse)
+
+        results = list(parser.parse(filename, content))
+
+        assert results == expected
+        for dt in results:
+            assert dt.tzinfo is not None
+
+    def test_parse_filters_future_and_ignored_dates(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """
+        Ensure parser filters out:
+          - dates after reference_time
+          - dates whose .date() are in ignore_dates
+        """
+        cfg = DateParserConfig(
+            languages=["en"],
+            timezone_str="UTC",
+            ignore_dates={datetime.date(2023, 12, 10)},
+            reference_time=datetime.datetime(
+                2024,
+                1,
+                15,
+                12,
+                0,
+                0,
+                tzinfo=datetime.timezone.utc,
+            ),
+            filename_date_order="YMD",
+            content_date_order="DMY",
+        )
+        parser = RegexDateParserPlugin(cfg)
+
+        target = "documents.plugins.date_parsing.base.dateparser.parse"
+
+        def fake_parse(
+            date_string: str,
+            settings: dict[str, Any] | None = None,
+            locales: None = None,
+        ) -> datetime.datetime | None:
+            if "10/12/2023" in date_string or "10-12-2023" in date_string:
+                # ignored date
+                return datetime.datetime(2023, 12, 10, tzinfo=datetime.timezone.utc)
+            if "01/02/2024" in date_string or "01-02-2024" in date_string:
+                # future relative to reference_time -> filtered
+                return datetime.datetime(2024, 2, 1, tzinfo=datetime.timezone.utc)
+            if "05/01/2023" in date_string or "05-01-2023" in date_string:
+                # valid
+                return datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)
+            return None
+
+        mocker.patch(target, side_effect=fake_parse)
+
+        content = "Ignored: 10/12/2023, Future: 01/02/2024, Keep: 05/01/2023"
+        results = list(parser.parse("whatever.txt", content))
+
+        assert results == [datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)]
+
+    def test_parse_handles_no_matches_and_returns_empty_list(
+        self,
+        base_config: DateParserConfig,
+    ) -> None:
+        """
+        When there are no matching date-like substrings, parse should yield nothing.
+        """
+        parser = RegexDateParserPlugin(base_config)
+        results = list(
+            parser.parse("no-dates.txt", "this has no dates whatsoever"),
+        )
+        assert results == []
+
+    def test_parse_skips_filename_when_filename_date_order_none(
+        self,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """
+        When filename_date_order is None the parser must not attempt to parse the filename.
+        Only dates found in the content should be passed to dateparser.parse.
+        """
+        cfg = DateParserConfig(
+            languages=["en"],
+            timezone_str="UTC",
+            ignore_dates=set(),
+            reference_time=datetime.datetime(
+                2024,
+                1,
+                15,
+                12,
+                0,
+                0,
+                tzinfo=datetime.timezone.utc,
+            ),
+            filename_date_order=None,
+            content_date_order="DMY",
+        )
+        parser = RegexDateParserPlugin(cfg)
+
+        # Patch the module's dateparser.parse so we can inspect calls
+        target = "documents.plugins.date_parsing.base.dateparser.parse"
+
+        def fake_parse(
+            date_string: str,
+            settings: dict[str, Any] | None = None,
+            locales: None = None,
+        ) -> datetime.datetime | None:
+            # return distinct datetimes so we can tell which source was parsed
+            if "25/12/2022" in date_string:
+                return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)
+            if "2023-12-25" in date_string:
+                return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)
+            return None
+
+        mock = mocker.patch(target, side_effect=fake_parse)
+
+        filename = "report-2023-12-25.txt"
+        content = "Event recorded on 25/12/2022."
+
+        results = list(parser.parse(filename, content))
+
+        # Only the content date should have been parsed -> one call
+        assert mock.call_count == 1
+
+        # # first call, first positional arg
+        called_date_string = mock.call_args_list[0][0][0]
+        assert "25/12/2022" in called_date_string
+        # And the parser should have yielded the corresponding datetime
+        assert results == [
+            datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
+        ]
index e3ccd8e4e3067ac955d73150452814a9a5970db2..c362f96465b7304f1a3efdbd8da87aaa8e87470a 100644 (file)
@@ -1989,11 +1989,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
         self.assertEqual(response.status_code, status.HTTP_200_OK)
 
-    @mock.patch("documents.parsers.parse_date_generator")
+    @mock.patch("documents.views.get_date_parser")
     @override_settings(NUMBER_OF_SUGGESTED_DATES=0)
     def test_get_suggestions_dates_disabled(
         self,
-        parse_date_generator,
+        mock_get_date_parser: mock.MagicMock,
     ):
         """
         GIVEN:
@@ -2010,7 +2010,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         )
 
         self.client.get(f"/api/documents/{doc.pk}/suggestions/")
-        self.assertFalse(parse_date_generator.called)
+
+        mock_get_date_parser.assert_not_called()
 
     def test_saved_views(self) -> None:
         u1 = User.objects.create_superuser("user1")
diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py
deleted file mode 100644 (file)
index c9c1fb8..0000000
+++ /dev/null
@@ -1,538 +0,0 @@
-import datetime
-from zoneinfo import ZoneInfo
-
-import pytest
-from pytest_django.fixtures import SettingsWrapper
-
-from documents.parsers import parse_date
-from documents.parsers import parse_date_generator
-
-
-@pytest.mark.django_db()
-class TestDate:
-    def test_date_format_1(self) -> None:
-        text = "lorem ipsum 130218 lorem ipsum"
-        assert parse_date("", text) is None
-
-    def test_date_format_2(self) -> None:
-        text = "lorem ipsum 2018 lorem ipsum"
-        assert parse_date("", text) is None
-
-    def test_date_format_3(self) -> None:
-        text = "lorem ipsum 20180213 lorem ipsum"
-        assert parse_date("", text) is None
-
-    def test_date_format_4(self, settings_timezone: ZoneInfo) -> None:
-        text = "lorem ipsum 13.02.2018 lorem ipsum"
-        date = parse_date("", text)
-        assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
-
-    def test_date_format_5(self, settings_timezone: ZoneInfo) -> None:
-        text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
-        date = parse_date("", text)
-        assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
-
-    def test_date_format_6(self) -> None:
-        text = (
-            "lorem ipsum\n"
-            "Wohnort\n"
-            "3100\n"
-            "IBAN\n"
-            "AT87 4534\n"
-            "1234\n"
-            "1234 5678\n"
-            "BIC\n"
-            "lorem ipsum"
-        )
-        assert parse_date("", text) is None
-
-    def test_date_format_7(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        settings.DATE_PARSER_LANGUAGES = ["de"]
-        text = "lorem ipsum\nMärz 2019\nlorem ipsum"
-        date = parse_date("", text)
-        assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
-
-    def test_date_format_8(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        settings.DATE_PARSER_LANGUAGES = ["de"]
-        text = (
-            "lorem ipsum\n"
-            "Wohnort\n"
-            "3100\n"
-            "IBAN\n"
-            "AT87 4534\n"
-            "1234\n"
-            "1234 5678\n"
-            "BIC\n"
-            "lorem ipsum\n"
-            "März 2020"
-        )
-        assert parse_date("", text) == datetime.datetime(
-            2020,
-            3,
-            1,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_9(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        settings.DATE_PARSER_LANGUAGES = ["de"]
-        text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
-        assert parse_date("", text) == datetime.datetime(
-            2020,
-            3,
-            1,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_10(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            22,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_11(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            22,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_12(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            22,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_13(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            22,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_14(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            22,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_15(self) -> None:
-        text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
-        assert parse_date("", text) is None
-
-    def test_date_format_16(self) -> None:
-        text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
-        assert parse_date("", text) is None
-
-    def test_date_format_17(self) -> None:
-        text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
-        assert parse_date("", text) is None
-
-    def test_date_format_18(self) -> None:
-        text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
-        assert parse_date("", text) is None
-
-    def test_date_format_19(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            21,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_20(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            22,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_21(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            2,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_22(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            23,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_23(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            24,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_24(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            21,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_25(self, settings_timezone: ZoneInfo) -> None:
-        text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
-        assert parse_date("", text) == datetime.datetime(
-            2022,
-            3,
-            25,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_date_format_26(self, settings_timezone: ZoneInfo) -> None:
-        text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
-        assert parse_date("", text) == datetime.datetime(
-            2019,
-            9,
-            25,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_crazy_date_past(self) -> None:
-        assert parse_date("", "01-07-0590 00:00:00") is None
-
-    def test_crazy_date_future(self) -> None:
-        assert parse_date("", "01-07-2350 00:00:00") is None
-
-    def test_crazy_date_with_spaces(self) -> None:
-        assert parse_date("", "20 408000l 2475") is None
-
-    def test_utf_month_names(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]
-        assert parse_date("", "13 décembre 2023") == datetime.datetime(
-            2023,
-            12,
-            13,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "13 août 2022") == datetime.datetime(
-            2022,
-            8,
-            13,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "11 März 2020") == datetime.datetime(
-            2020,
-            3,
-            11,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
-            2018,
-            3,
-            17,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "1. veljače 2016.") == datetime.datetime(
-            2016,
-            2,
-            1,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "15. února 1985") == datetime.datetime(
-            1985,
-            2,
-            15,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "30. září 2011") == datetime.datetime(
-            2011,
-            9,
-            30,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "28. května 1990") == datetime.datetime(
-            1990,
-            5,
-            28,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "1. grudzień 1997") == datetime.datetime(
-            1997,
-            12,
-            1,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "17 Şubat 2024") == datetime.datetime(
-            2024,
-            2,
-            17,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
-            2012,
-            8,
-            30,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "17 Eylül 2000") == datetime.datetime(
-            2000,
-            9,
-            17,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-        assert parse_date("", "5. október 1992") == datetime.datetime(
-            1992,
-            10,
-            5,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_multiple_dates(self, settings_timezone: ZoneInfo) -> None:
-        text = """This text has multiple dates.
-                  For example 02.02.2018, 22 July 2022 and December 2021.
-                  But not 24-12-9999 because it's in the future..."""
-        dates = list(parse_date_generator("", text))
-
-        assert dates == [
-            datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
-            datetime.datetime(
-                2022,
-                7,
-                22,
-                0,
-                0,
-                tzinfo=settings_timezone,
-            ),
-            datetime.datetime(
-                2021,
-                12,
-                1,
-                0,
-                0,
-                tzinfo=settings_timezone,
-            ),
-        ]
-
-    def test_filename_date_parse_valid_ymd(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        """
-        GIVEN:
-            - Date parsing from the filename is enabled
-            - Filename date format is with Year Month Day (YMD)
-            - Filename contains date matching the format
-
-        THEN:
-            - Should parse the date from the filename
-        """
-        settings.FILENAME_DATE_ORDER = "YMD"
-
-        assert parse_date(
-            "/tmp/Scan-2022-04-01.pdf",
-            "No date in here",
-        ) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
-
-    def test_filename_date_parse_valid_dmy(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        """
-        GIVEN:
-            - Date parsing from the filename is enabled
-            - Filename date format is with Day Month Year (DMY)
-            - Filename contains date matching the format
-
-        THEN:
-            - Should parse the date from the filename
-        """
-        settings.FILENAME_DATE_ORDER = "DMY"
-        assert parse_date(
-            "/tmp/Scan-10.01.2021.pdf",
-            "No date in here",
-        ) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
-
-    def test_filename_date_parse_invalid(self, settings: SettingsWrapper) -> None:
-        """
-        GIVEN:
-            - Date parsing from the filename is enabled
-            - Filename includes no date
-            - File content includes no date
-
-        THEN:
-            - No date is parsed
-        """
-        settings.FILENAME_DATE_ORDER = "YMD"
-        assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
-
-    def test_filename_date_ignored_use_content(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        """
-        GIVEN:
-            - Date parsing from the filename is enabled
-            - Filename date format is with Day Month Year (YMD)
-            - Date order is Day Month Year (DMY, the default)
-            - Filename contains date matching the format
-            - Filename date is an ignored date
-            - File content includes a date
-
-        THEN:
-            - Should parse the date from the content not filename
-        """
-        settings.FILENAME_DATE_ORDER = "YMD"
-        settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
-        assert parse_date(
-            "/tmp/Scan-2022-04-01.pdf",
-            "The matching date is 24.03.2022",
-        ) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
-
-    def test_ignored_dates_default_order(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        """
-        GIVEN:
-            - Ignore dates have been set
-            - File content includes ignored dates
-            - File content includes 1 non-ignored date
-
-        THEN:
-            - Should parse the date non-ignored date from content
-        """
-        settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
-        text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
-        assert parse_date("", text) == datetime.datetime(
-            2018,
-            2,
-            13,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
-
-    def test_ignored_dates_order_ymd(
-        self,
-        settings: SettingsWrapper,
-        settings_timezone: ZoneInfo,
-    ) -> None:
-        """
-        GIVEN:
-            - Ignore dates have been set
-            - Date order is Year Month Date (YMD)
-            - File content includes ignored dates
-            - File content includes 1 non-ignored date
-
-        THEN:
-            - Should parse the date non-ignored date from content
-        """
-
-        settings.FILENAME_DATE_ORDER = "YMD"
-        settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
-
-        text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
-
-        assert parse_date("", text) == datetime.datetime(
-            2018,
-            2,
-            13,
-            0,
-            0,
-            tzinfo=settings_timezone,
-        )
index fbd72b10d13395cdb89a58619b42094d50169346..c634c007e3d62eb36c86ec6e7a190c98584cf3b6 100644 (file)
@@ -148,7 +148,6 @@ from documents.models import Workflow
 from documents.models import WorkflowAction
 from documents.models import WorkflowTrigger
 from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import parse_date_generator
 from documents.permissions import AcknowledgeTasksPermissions
 from documents.permissions import PaperlessAdminPermissions
 from documents.permissions import PaperlessNotePermissions
@@ -158,6 +157,7 @@ from documents.permissions import get_document_count_filter_for_user
 from documents.permissions import get_objects_for_user_owner_aware
 from documents.permissions import has_perms_owner_aware
 from documents.permissions import set_permissions_for_object
+from documents.plugins.date_parsing import get_date_parser
 from documents.schema import generate_object_with_permissions_schema
 from documents.serialisers import AcknowledgeTasksViewSerializer
 from documents.serialisers import BulkDownloadSerializer
@@ -1023,16 +1023,17 @@ class DocumentViewSet(
 
             dates = []
             if settings.NUMBER_OF_SUGGESTED_DATES > 0:
-                gen = parse_date_generator(doc.filename, doc.content)
-                dates = sorted(
-                    {
-                        i
-                        for i in itertools.islice(
-                            gen,
-                            settings.NUMBER_OF_SUGGESTED_DATES,
-                        )
-                    },
-                )
+                with get_date_parser() as date_parser:
+                    gen = date_parser.parse(doc.filename, doc.content)
+                    dates = sorted(
+                        {
+                            i
+                            for i in itertools.islice(
+                                gen,
+                                settings.NUMBER_OF_SUGGESTED_DATES,
+                            )
+                        },
+                    )
 
             resp_data = {
                 "correspondents": [