From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 3 Feb 2026 20:09:13 +0000 (-0800) Subject: Feature: Enable users to customize date parsing via plugins (#11931) X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2ec8ec96c806d08389755f26cd3546975d99dc56;p=thirdparty%2Fpaperless-ngx.git Feature: Enable users to customize date parsing via plugins (#11931) --- diff --git a/docs/development.md b/docs/development.md index de328e1f8e..9d3b1460a1 100644 --- a/docs/development.md +++ b/docs/development.md @@ -481,3 +481,147 @@ To get started: 5. The project is ready for debugging, start either run the fullstack debug or individual debug processes. Yo spin up the project without debugging run the task **Project Start: Run all Services** + +## Developing Date Parser Plugins + +Paperless-ngx uses a plugin system for date parsing, allowing you to extend or replace the default date parsing behavior. Plugins are discovered using [Python entry points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). + +### Creating a Date Parser Plugin + +To create a custom date parser plugin, you need to: + +1. Create a class that inherits from `DateParserPluginBase` +2. Implement the required abstract method +3. Register your plugin via an entry point + +#### 1. Implementing the Parser Class + +Your parser must extend `documents.plugins.date_parsing.DateParserPluginBase` and implement the `parse` method: + +```python +from collections.abc import Iterator +import datetime + +from documents.plugins.date_parsing import DateParserPluginBase + + +class MyDateParserPlugin(DateParserPluginBase): + """ + Custom date parser implementation. + """ + + def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]: + """ + Parse dates from the document's filename and content. 
+ + Args: + filename: The original filename of the document + content: The extracted text content of the document + + Yields: + datetime.datetime: Valid datetime objects found in the document + """ + # Your parsing logic here + # Use self.config to access configuration settings + + # Example: parse dates from filename first + if self.config.filename_date_order: + # Your filename parsing logic + yield some_datetime + + # Then parse dates from content + # Your content parsing logic + yield another_datetime +``` + +#### 2. Configuration and Helper Methods + +Your parser instance is initialized with a `DateParserConfig` object accessible via `self.config`. This provides: + +- `languages: list[str]` - List of language codes for date parsing +- `timezone_str: str` - Timezone string for date localization +- `ignore_dates: set[datetime.date]` - Dates that should be filtered out +- `reference_time: datetime.datetime` - Current time for filtering future dates +- `filename_date_order: str | None` - Date order preference for filenames (e.g., "DMY", "MDY") +- `content_date_order: str` - Date order preference for content + +The base class provides two helper methods you can use: + +```python +def _parse_string( + self, + date_string: str, + date_order: str, +) -> datetime.datetime | None: + """ + Parse a single date string using dateparser with configured settings. + """ + +def _filter_date( + self, + date: datetime.datetime | None, +) -> datetime.datetime | None: + """ + Validate a parsed datetime against configured rules. + Filters out dates in or before the year 1900, future dates, and ignored dates. + """ +``` + +#### 3. Resource Management (Optional) + +If your plugin needs to acquire or release resources (database connections, API clients, etc.), override the context manager methods (`__enter__` and `__exit__`). Paperless-ngx will always use plugins as context managers, ensuring resources can be released even in the event of errors. + +#### 4. 
Registering Your Plugin + +Register your plugin using a setuptools entry point in your package's `pyproject.toml`: + +```toml +[project.entry-points."paperless_ngx.date_parsers"] +my_parser = "my_package.parsers:MyDateParserPlugin" +``` + +The entry point name (e.g., `"my_parser"`) is used for sorting when multiple plugins are found. Paperless-ngx will use the first plugin alphabetically by name if multiple plugins are discovered. + +### Plugin Discovery + +Paperless-ngx automatically discovers and loads date parser plugins at runtime. The discovery process: + +1. Queries the `paperless_ngx.date_parsers` entry point group +2. Validates that each plugin is a subclass of `DateParserPluginBase` +3. Sorts valid plugins alphabetically by entry point name +4. Uses the first valid plugin, or falls back to the default `RegexDateParserPlugin` if none are found + +If multiple plugins are installed, a warning is logged indicating which plugin was selected. + +### Example: Simple Date Parser + +Here's a minimal example that only looks for ISO 8601 dates: + +```python +import datetime +import re +from collections.abc import Iterator + +from documents.plugins.date_parsing.base import DateParserPluginBase + + +class ISODateParserPlugin(DateParserPluginBase): + """ + Parser that only matches ISO 8601 formatted dates (YYYY-MM-DD). 
+ """ + + ISO_REGEX = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b") + + def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]: + # Combine filename and content for searching + text = f"{filename} {content}" + + for match in self.ISO_REGEX.finditer(text): + date_string = match.group(1) + # Use helper method to parse with configured timezone + date = self._parse_string(date_string, "YMD") + # Use helper method to validate the date + filtered_date = self._filter_date(date) + if filtered_date is not None: + yield filtered_date +``` diff --git a/pyproject.toml b/pyproject.toml index b9bbca96b0..1faa435f1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -306,6 +306,7 @@ markers = [ "gotenberg: Tests requiring Gotenberg service", "tika: Tests requiring Tika service", "greenmail: Tests requiring Greenmail service", + "date_parsing: Tests which cover date parsing from content or filename", ] [tool.pytest_env] @@ -332,6 +333,10 @@ exclude_also = [ [tool.mypy] mypy_path = "src" +files = [ + "src/documents/plugins/date_parsing", + "src/documents/tests/date_parsing", +] plugins = [ "mypy_django_plugin.main", "mypy_drf_plugin.main", @@ -343,5 +348,28 @@ disallow_untyped_defs = true warn_redundant_casts = true warn_unused_ignores = true +# This prevents errors from imports, but allows type-checking logic to work +follow_imports = "silent" + +[[tool.mypy.overrides]] +module = [ + "documents.*", + "paperless.*", + "paperless_ai.*", + "paperless_mail.*", + "paperless_tesseract.*", + "paperless_remote.*", + "paperless_text.*", + "paperless_tika.*", +] +ignore_errors = true + +[[tool.mypy.overrides]] +module = [ + "documents.plugins.date_parsing.*", + "documents.tests.date_parsing.*", +] +ignore_errors = false + [tool.django-stubs] django_settings_module = "paperless.settings" diff --git a/src/documents/consumer.py b/src/documents/consumer.py index d9a149ed51..10a9520152 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -33,12 +33,12 @@ from 
documents.models import WorkflowTrigger from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import get_parser_class_for_mime_type -from documents.parsers import parse_date from documents.permissions import set_permissions_for_object from documents.plugins.base import AlwaysRunPluginMixin from documents.plugins.base import ConsumeTaskPlugin from documents.plugins.base import NoCleanupPluginMixin from documents.plugins.base import NoSetupPluginMixin +from documents.plugins.date_parsing import get_date_parser from documents.plugins.helpers import ProgressManager from documents.plugins.helpers import ProgressStatusOptions from documents.signals import document_consumption_finished @@ -432,7 +432,8 @@ class ConsumerPlugin( ProgressStatusOptions.WORKING, ConsumerStatusShortMessage.PARSE_DATE, ) - date = parse_date(self.filename, text) + with get_date_parser() as date_parser: + date = next(date_parser.parse(self.filename, text), None) archive_path = document_parser.get_archive_path() page_count = document_parser.get_page_count(self.working_copy, mime_type) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 211fb61fe9..b59e7d6b79 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -9,22 +9,17 @@ import subprocess import tempfile from functools import lru_cache from pathlib import Path -from re import Match from typing import TYPE_CHECKING from django.conf import settings -from django.utils import timezone from documents.loggers import LoggingMixin from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess -from paperless.config import OcrConfig -from paperless.utils import ocr_to_dateparser_languages if TYPE_CHECKING: import datetime - from collections.abc import Iterator # This regular expression will try to find dates in the document at # hand and will match the following 
formats: @@ -259,75 +254,6 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) - return out_path -def parse_date(filename, text) -> datetime.datetime | None: - return next(parse_date_generator(filename, text), None) - - -def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: - """ - Returns the date of the document. - """ - - def __parser(ds: str, date_order: str) -> datetime.datetime: - """ - Call dateparser.parse with a particular date ordering - """ - import dateparser - - ocr_config = OcrConfig() - languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages( - ocr_config.language, - ) - - return dateparser.parse( - ds, - settings={ - "DATE_ORDER": date_order, - "PREFER_DAY_OF_MONTH": "first", - "RETURN_AS_TIMEZONE_AWARE": True, - "TIMEZONE": settings.TIME_ZONE, - }, - locales=languages, - ) - - def __filter(date: datetime.datetime) -> datetime.datetime | None: - if ( - date is not None - and date.year > 1900 - and date <= timezone.now() - and date.date() not in settings.IGNORE_DATES - ): - return date - return None - - def __process_match( - match: Match[str], - date_order: str, - ) -> datetime.datetime | None: - date_string = match.group(0) - - try: - date = __parser(date_string, date_order) - except Exception: - # Skip all matches that do not parse to a proper date - date = None - - return __filter(date) - - def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]: - for m in re.finditer(DATE_REGEX, content): - date = __process_match(m, date_order) - if date is not None: - yield date - - # if filename date parsing is enabled, search there first: - if settings.FILENAME_DATE_ORDER: - yield from __process_content(filename, settings.FILENAME_DATE_ORDER) - - # Iterate through all regex matches in text and try to parse the date - yield from __process_content(text, settings.DATE_ORDER) - - class ParseError(Exception): pass diff --git 
a/src/documents/plugins/date_parsing/__init__.py b/src/documents/plugins/date_parsing/__init__.py new file mode 100644 index 0000000000..2eec1e2425 --- /dev/null +++ b/src/documents/plugins/date_parsing/__init__.py @@ -0,0 +1,101 @@ +import logging +from functools import lru_cache +from importlib.metadata import EntryPoint +from importlib.metadata import entry_points +from typing import Final + +from django.conf import settings +from django.utils import timezone + +from documents.plugins.date_parsing.base import DateParserConfig +from documents.plugins.date_parsing.base import DateParserPluginBase +from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin +from paperless.config import OcrConfig +from paperless.utils import ocr_to_dateparser_languages + +logger = logging.getLogger(__name__) + +DATE_PARSER_ENTRY_POINT_GROUP: Final = "paperless_ngx.date_parsers" + + +@lru_cache(maxsize=1) +def _discover_parser_class() -> type[DateParserPluginBase]: + """ + Discovers the date parser plugin class to use. + + - If one or more plugins are found, sorts them by name and returns the first. + - If no plugins are found, returns the default RegexDateParser. + """ + + eps: tuple[EntryPoint, ...] + try: + eps = entry_points(group=DATE_PARSER_ENTRY_POINT_GROUP) + except Exception as e: + # Log a warning + logger.warning(f"Could not query entry points for date parsers: {e}") + eps = () + + valid_plugins: list[EntryPoint] = [] + for ep in eps: + try: + plugin_class = ep.load() + if plugin_class and issubclass(plugin_class, DateParserPluginBase): + valid_plugins.append(ep) + else: + logger.warning(f"Plugin {ep.name} does not subclass DateParser.") + except Exception as e: + logger.error(f"Unable to load date parser plugin {ep.name}: {e}") + + if not valid_plugins: + return RegexDateParserPlugin + + valid_plugins.sort(key=lambda ep: ep.name) + + if len(valid_plugins) > 1: + logger.warning( + f"Multiple date parsers found: " + f"{[ep.name for ep in valid_plugins]}. 
" + f"Using the first one by name: '{valid_plugins[0].name}'.", + ) + + return valid_plugins[0].load() + + +def get_date_parser() -> DateParserPluginBase: + """ + Factory function to get an initialized date parser instance. + + This function is responsible for: + 1. Discovering the correct parser class (plugin or default). + 2. Loading configuration from Django settings. + 3. Instantiating the parser with the configuration. + """ + # 1. Discover the class (this is cached) + parser_class = _discover_parser_class() + + # 2. Load configuration from settings + # TODO: Get the language from the settings and/or configuration object, depending + ocr_config = OcrConfig() + languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages( + ocr_config.language, + ) + + config = DateParserConfig( + languages=languages, + timezone_str=settings.TIME_ZONE, + ignore_dates=settings.IGNORE_DATES, + reference_time=timezone.now(), + filename_date_order=settings.FILENAME_DATE_ORDER, + content_date_order=settings.DATE_ORDER, + ) + + # 3. Instantiate the discovered class with the config + return parser_class(config=config) + + +__all__ = [ + "DateParserConfig", + "DateParserPluginBase", + "RegexDateParserPlugin", + "get_date_parser", +] diff --git a/src/documents/plugins/date_parsing/base.py b/src/documents/plugins/date_parsing/base.py new file mode 100644 index 0000000000..c6df1a70f2 --- /dev/null +++ b/src/documents/plugins/date_parsing/base.py @@ -0,0 +1,124 @@ +import datetime +import logging +from abc import ABC +from abc import abstractmethod +from collections.abc import Iterator +from dataclasses import dataclass +from types import TracebackType + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +import dateparser + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class DateParserConfig: + """ + Configuration for a DateParser instance. 
+ + This object is created by the factory and passed to the + parser's constructor, decoupling the parser from settings. + """ + + languages: list[str] + timezone_str: str + ignore_dates: set[datetime.date] + + # A "now" timestamp for filtering future dates. + # Passed in by the factory. + reference_time: datetime.datetime + + # Settings for the default RegexDateParser + # Other plugins should use or consider these, but it is not required + filename_date_order: str | None + content_date_order: str + + +class DateParserPluginBase(ABC): + """ + Abstract base class for date parsing strategies. + + Instances are configured via a DateParserConfig object. + """ + + def __init__(self, config: DateParserConfig): + """ + Initializes the parser with its configuration. + """ + self.config = config + + def __enter__(self) -> Self: + """ + Enter the runtime context related to this object. + + Subclasses can override this to acquire resources (connections, handles). + """ + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + """ + Exit the runtime context related to this object. + + Subclasses can override this to release resources. + """ + # Default implementation does nothing. + # Returning None implies exceptions are propagated. + + def _parse_string( + self, + date_string: str, + date_order: str, + ) -> datetime.datetime | None: + """ + Helper method to parse a single date string using dateparser. + + Uses configuration from `self.config`. 
+ """ + try: + return dateparser.parse( + date_string, + settings={ + "DATE_ORDER": date_order, + "PREFER_DAY_OF_MONTH": "first", + "RETURN_AS_TIMEZONE_AWARE": True, + "TIMEZONE": self.config.timezone_str, + }, + locales=self.config.languages, + ) + except Exception as e: + logger.error(f"Error while parsing date string '{date_string}': {e}") + return None + + def _filter_date( + self, + date: datetime.datetime | None, + ) -> datetime.datetime | None: + """ + Helper method to validate a parsed datetime object. + + Uses configuration from `self.config`. + """ + if ( + date is not None + and date.year > 1900 + and date <= self.config.reference_time + and date.date() not in self.config.ignore_dates + ): + return date + return None + + @abstractmethod + def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]: + """ + Parses a document's filename and content, yielding valid datetime objects. + """ diff --git a/src/documents/plugins/date_parsing/regex_parser.py b/src/documents/plugins/date_parsing/regex_parser.py new file mode 100644 index 0000000000..2df8f92955 --- /dev/null +++ b/src/documents/plugins/date_parsing/regex_parser.py @@ -0,0 +1,65 @@ +import datetime +import re +from collections.abc import Iterator +from re import Match + +from documents.plugins.date_parsing.base import DateParserPluginBase + + +class RegexDateParserPlugin(DateParserPluginBase): + """ + The default date parser, using a series of regular expressions. + + It is configured entirely by the DateParserConfig object + passed to its constructor. + """ + + DATE_REGEX = re.compile( + r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))(\d{1,2}[\. 
]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))", + re.IGNORECASE, + ) + + def _process_match( + self, + match: Match[str], + date_order: str, + ) -> datetime.datetime | None: + """ + Processes a single regex match using the base class helpers. + """ + date_string = match.group(0) + date = self._parse_string(date_string, date_order) + return self._filter_date(date) + + def _process_content( + self, + content: str, + date_order: str, + ) -> Iterator[datetime.datetime]: + """ + Finds all regex matches in content and yields valid dates. + """ + for m in re.finditer(self.DATE_REGEX, content): + date = self._process_match(m, date_order) + if date is not None: + yield date + + def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]: + """ + Implementation of the abstract parse method. + + Reads its configuration from `self.config`. 
+ """ + if self.config.filename_date_order: + yield from self._process_content( + filename, + self.config.filename_date_order, + ) + + yield from self._process_content(content, self.config.content_date_order) diff --git a/src/documents/tests/date_parsing/__init__.py b/src/documents/tests/date_parsing/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/documents/tests/date_parsing/conftest.py b/src/documents/tests/date_parsing/conftest.py new file mode 100644 index 0000000000..ea9e2447da --- /dev/null +++ b/src/documents/tests/date_parsing/conftest.py @@ -0,0 +1,82 @@ +import datetime +from collections.abc import Generator +from typing import Any + +import pytest +import pytest_django + +from documents.plugins.date_parsing import _discover_parser_class +from documents.plugins.date_parsing.base import DateParserConfig +from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin + + +@pytest.fixture +def base_config() -> DateParserConfig: + """Basic configuration for date parser testing.""" + return DateParserConfig( + languages=["en"], + timezone_str="UTC", + ignore_dates=set(), + reference_time=datetime.datetime( + 2024, + 1, + 15, + 12, + 0, + 0, + tzinfo=datetime.timezone.utc, + ), + filename_date_order="YMD", + content_date_order="DMY", + ) + + +@pytest.fixture +def config_with_ignore_dates() -> DateParserConfig: + """Configuration with dates to ignore.""" + return DateParserConfig( + languages=["en", "de"], + timezone_str="America/New_York", + ignore_dates={datetime.date(2024, 1, 1), datetime.date(2024, 12, 25)}, + reference_time=datetime.datetime( + 2024, + 1, + 15, + 12, + 0, + 0, + tzinfo=datetime.timezone.utc, + ), + filename_date_order="DMY", + content_date_order="MDY", + ) + + +@pytest.fixture +def regex_parser(base_config: DateParserConfig) -> RegexDateParserPlugin: + """Instance of RegexDateParser with base config.""" + return RegexDateParserPlugin(base_config) + + +@pytest.fixture +def clear_lru_cache() -> 
Generator[None, None, None]: + """ + Ensure the LRU cache for _discover_parser_class is cleared + before and after any test that depends on it. + """ + _discover_parser_class.cache_clear() + yield + _discover_parser_class.cache_clear() + + +@pytest.fixture +def mock_date_parser_settings(settings: pytest_django.fixtures.SettingsWrapper) -> Any: + """ + Override Django settings for the duration of date parser tests. + """ + settings.DATE_PARSER_LANGUAGES = ["en", "de"] + settings.TIME_ZONE = "UTC" + settings.IGNORE_DATES = [datetime.date(1900, 1, 1)] + settings.FILENAME_DATE_ORDER = "YMD" + settings.DATE_ORDER = "DMY" + return settings diff --git a/src/documents/tests/date_parsing/test_date_parser_plugin_loading.py b/src/documents/tests/date_parsing/test_date_parser_plugin_loading.py new file mode 100644 index 0000000000..5d870fea10 --- /dev/null +++ b/src/documents/tests/date_parsing/test_date_parser_plugin_loading.py @@ -0,0 +1,229 @@ +import datetime +import logging +from collections.abc import Iterator +from importlib.metadata import EntryPoint + +import pytest +import pytest_mock +from django.utils import timezone + +from documents.plugins.date_parsing import DATE_PARSER_ENTRY_POINT_GROUP +from documents.plugins.date_parsing import _discover_parser_class +from documents.plugins.date_parsing import get_date_parser +from documents.plugins.date_parsing.base import DateParserConfig +from documents.plugins.date_parsing.base import DateParserPluginBase +from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin + + +class AlphaParser(DateParserPluginBase): + def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]: + yield timezone.now() + + +class BetaParser(DateParserPluginBase): + def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]: + yield timezone.now() + + +@pytest.mark.date_parsing +@pytest.mark.usefixtures("clear_lru_cache") +class TestDiscoverParserClass: + """Tests for the _discover_parser_class() 
function.""" + + def test_returns_default_when_no_plugins_found( + self, + mocker: pytest_mock.MockerFixture, + ) -> None: + mocker.patch( + "documents.plugins.date_parsing.entry_points", + return_value=(), + ) + result = _discover_parser_class() + assert result is RegexDateParserPlugin + + def test_returns_default_when_entrypoint_query_fails( + self, + mocker: pytest_mock.MockerFixture, + caplog: pytest.LogCaptureFixture, + ) -> None: + mocker.patch( + "documents.plugins.date_parsing.entry_points", + side_effect=RuntimeError("boom"), + ) + result = _discover_parser_class() + assert result is RegexDateParserPlugin + assert "Could not query entry points" in caplog.text + + def test_filters_out_invalid_plugins( + self, + mocker: pytest_mock.MockerFixture, + caplog: pytest.LogCaptureFixture, + ) -> None: + fake_ep = mocker.MagicMock(spec=EntryPoint) + fake_ep.name = "bad_plugin" + fake_ep.load.return_value = object # not subclass of DateParser + + mocker.patch( + "documents.plugins.date_parsing.entry_points", + return_value=(fake_ep,), + ) + + result = _discover_parser_class() + assert result is RegexDateParserPlugin + assert "does not subclass DateParser" in caplog.text + + def test_skips_plugins_that_fail_to_load( + self, + mocker: pytest_mock.MockerFixture, + caplog: pytest.LogCaptureFixture, + ) -> None: + fake_ep = mocker.MagicMock(spec=EntryPoint) + fake_ep.name = "failing_plugin" + fake_ep.load.side_effect = ImportError("cannot import") + + mocker.patch( + "documents.plugins.date_parsing.entry_points", + return_value=(fake_ep,), + ) + + result = _discover_parser_class() + assert result is RegexDateParserPlugin + assert "Unable to load date parser plugin failing_plugin" in caplog.text + + def test_returns_single_valid_plugin_without_warning( + self, + mocker: pytest_mock.MockerFixture, + caplog: pytest.LogCaptureFixture, + ) -> None: + """If exactly one valid plugin is discovered, it should be returned without logging a warning.""" + + ep = 
mocker.MagicMock(spec=EntryPoint) + ep.name = "alpha" + ep.load.return_value = AlphaParser + + mock_entry_points = mocker.patch( + "documents.plugins.date_parsing.entry_points", + return_value=(ep,), + ) + + with caplog.at_level( + logging.WARNING, + logger="documents.plugins.date_parsing", + ): + result = _discover_parser_class() + + # It should have called entry_points with the correct group + mock_entry_points.assert_called_once_with(group=DATE_PARSER_ENTRY_POINT_GROUP) + + # The discovered class should be exactly our AlphaParser + assert result is AlphaParser + + # No warnings should have been logged + assert not any( + "Multiple date parsers found" in record.message for record in caplog.records + ), "Unexpected warning logged when only one plugin was found" + + def test_returns_first_valid_plugin_by_name( + self, + mocker: pytest_mock.MockerFixture, + ) -> None: + ep_a = mocker.MagicMock(spec=EntryPoint) + ep_a.name = "alpha" + ep_a.load.return_value = AlphaParser + + ep_b = mocker.MagicMock(spec=EntryPoint) + ep_b.name = "beta" + ep_b.load.return_value = BetaParser + + mocker.patch( + "documents.plugins.date_parsing.entry_points", + return_value=(ep_b, ep_a), + ) + + result = _discover_parser_class() + assert result is AlphaParser + + def test_logs_warning_if_multiple_plugins_found( + self, + mocker: pytest_mock.MockerFixture, + caplog: pytest.LogCaptureFixture, + ) -> None: + ep1 = mocker.MagicMock(spec=EntryPoint) + ep1.name = "a" + ep1.load.return_value = AlphaParser + + ep2 = mocker.MagicMock(spec=EntryPoint) + ep2.name = "b" + ep2.load.return_value = BetaParser + + mocker.patch( + "documents.plugins.date_parsing.entry_points", + return_value=(ep1, ep2), + ) + + with caplog.at_level( + logging.WARNING, + logger="documents.plugins.date_parsing", + ): + result = _discover_parser_class() + + # Should select alphabetically first plugin ("a") + assert result is AlphaParser + + # Should log a warning mentioning multiple parsers + assert any( + "Multiple date 
parsers found" in record.message for record in caplog.records + ), "Expected a warning about multiple date parsers" + + def test_cache_behavior_only_runs_once( + self, + mocker: pytest_mock.MockerFixture, + ) -> None: + mock_entry_points = mocker.patch( + "documents.plugins.date_parsing.entry_points", + return_value=(), + ) + + # First call populates cache + _discover_parser_class() + # Second call should not re-invoke entry_points + _discover_parser_class() + mock_entry_points.assert_called_once() + + +@pytest.mark.django_db +@pytest.mark.date_parsing +@pytest.mark.usefixtures("mock_date_parser_settings") +class TestGetDateParser: + """Tests for the get_date_parser() factory function.""" + + def test_returns_instance_of_discovered_class( + self, + mocker: pytest_mock.MockerFixture, + ) -> None: + mocker.patch( + "documents.plugins.date_parsing._discover_parser_class", + return_value=AlphaParser, + ) + parser = get_date_parser() + assert isinstance(parser, AlphaParser) + assert isinstance(parser.config, DateParserConfig) + assert parser.config.languages == ["en", "de"] + assert parser.config.timezone_str == "UTC" + assert parser.config.ignore_dates == [datetime.date(1900, 1, 1)] + assert parser.config.filename_date_order == "YMD" + assert parser.config.content_date_order == "DMY" + # Check reference_time near now + delta = abs((parser.config.reference_time - timezone.now()).total_seconds()) + assert delta < 2 + + def test_uses_default_regex_parser_when_no_plugins( + self, + mocker: pytest_mock.MockerFixture, + ) -> None: + mocker.patch( + "documents.plugins.date_parsing._discover_parser_class", + return_value=RegexDateParserPlugin, + ) + parser = get_date_parser() + assert isinstance(parser, RegexDateParserPlugin) diff --git a/src/documents/tests/date_parsing/test_date_parsing.py b/src/documents/tests/date_parsing/test_date_parsing.py new file mode 100644 index 0000000000..a587b32cc2 --- /dev/null +++ b/src/documents/tests/date_parsing/test_date_parsing.py @@ -0,0 
+1,433 @@ +import datetime +import logging +from typing import Any + +import pytest +import pytest_mock + +from documents.plugins.date_parsing.base import DateParserConfig +from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin + + +@pytest.mark.date_parsing +class TestParseString: + """Tests for DateParser._parse_string method via RegexDateParser.""" + + @pytest.mark.parametrize( + ("date_string", "date_order", "expected_year"), + [ + pytest.param("15/01/2024", "DMY", 2024, id="dmy_slash"), + pytest.param("01/15/2024", "MDY", 2024, id="mdy_slash"), + pytest.param("2024/01/15", "YMD", 2024, id="ymd_slash"), + pytest.param("January 15, 2024", "DMY", 2024, id="month_name_comma"), + pytest.param("15 Jan 2024", "DMY", 2024, id="day_abbr_month_year"), + pytest.param("15.01.2024", "DMY", 2024, id="dmy_dot"), + pytest.param("2024-01-15", "YMD", 2024, id="ymd_dash"), + ], + ) + def test_parse_string_valid_formats( + self, + regex_parser: RegexDateParserPlugin, + date_string: str, + date_order: str, + expected_year: int, + ) -> None: + """Should correctly parse various valid date formats.""" + result = regex_parser._parse_string(date_string, date_order) + + assert result is not None + assert result.year == expected_year + + @pytest.mark.parametrize( + "invalid_string", + [ + pytest.param("not a date", id="plain_text"), + pytest.param("32/13/2024", id="invalid_day_month"), + pytest.param("", id="empty_string"), + pytest.param("abc123xyz", id="alphanumeric_gibberish"), + pytest.param("99/99/9999", id="out_of_range"), + ], + ) + def test_parse_string_invalid_input( + self, + regex_parser: RegexDateParserPlugin, + invalid_string: str, + ) -> None: + """Should return None for invalid date strings.""" + result = regex_parser._parse_string(invalid_string, "DMY") + + assert result is None + + def test_parse_string_handles_exceptions( + self, + caplog: pytest.LogCaptureFixture, + mocker: pytest_mock.MockerFixture, + regex_parser: RegexDateParserPlugin, + ) -> 
None: + """Should handle and log exceptions from dateparser gracefully.""" + with caplog.at_level( + logging.ERROR, + logger="documents.plugins.date_parsing.base", + ): + # We still need to mock dateparser.parse to force the exception + mocker.patch( + "documents.plugins.date_parsing.base.dateparser.parse", + side_effect=ValueError( + "Parsing error: 01/01/2024", + ), + ) + + # 1. Execute the function under test + result = regex_parser._parse_string("01/01/2024", "DMY") + + assert result is None + + # Check if an error was logged + assert len(caplog.records) == 1 + assert caplog.records[0].levelname == "ERROR" + + # Check if the specific error message is present + assert "Error while parsing date string" in caplog.text + # Optional: Check for the exact exception message if it's included in the log + assert "Parsing error: 01/01/2024" in caplog.text + + +@pytest.mark.date_parsing +class TestFilterDate: + """Tests for DateParser._filter_date method via RegexDateParser.""" + + @pytest.mark.parametrize( + ("date", "expected_output"), + [ + # Valid Dates + pytest.param( + datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc), + id="valid_past_date", + ), + pytest.param( + datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc), + id="exactly_at_reference", + ), + pytest.param( + datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc), + datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc), + id="year_1901_valid", + ), + # Date is > reference_time + pytest.param( + datetime.datetime(2024, 1, 16, tzinfo=datetime.timezone.utc), + None, + id="future_date_day_after", + ), + # date.date() in ignore_dates + pytest.param( + datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), + None, + id="ignored_date_midnight_jan1", + ), + pytest.param( + datetime.datetime(2024, 1, 1, 10, 30, 0, 
tzinfo=datetime.timezone.utc), + None, + id="ignored_date_midday_jan1", + ), + pytest.param( + datetime.datetime(2024, 12, 25, 15, 0, 0, tzinfo=datetime.timezone.utc), + None, + id="ignored_date_dec25_future", + ), + # date.year <= 1900 + pytest.param( + datetime.datetime(1899, 12, 31, tzinfo=datetime.timezone.utc), + None, + id="year_1899", + ), + pytest.param( + datetime.datetime(1900, 1, 1, tzinfo=datetime.timezone.utc), + None, + id="year_1900_boundary", + ), + # date is None + pytest.param(None, None, id="none_input"), + ], + ) + def test_filter_date_validation_rules( + self, + config_with_ignore_dates: DateParserConfig, + date: datetime.datetime | None, + expected_output: datetime.datetime | None, + ) -> None: + """Should correctly validate dates against various rules.""" + parser = RegexDateParserPlugin(config_with_ignore_dates) + result = parser._filter_date(date) + assert result == expected_output + + def test_filter_date_respects_ignore_dates( + self, + config_with_ignore_dates: DateParserConfig, + ) -> None: + """Should filter out dates in the ignore_dates set.""" + parser = RegexDateParserPlugin(config_with_ignore_dates) + + ignored_date = datetime.datetime( + 2024, + 1, + 1, + 12, + 0, + tzinfo=datetime.timezone.utc, + ) + another_ignored = datetime.datetime( + 2024, + 12, + 25, + 15, + 30, + tzinfo=datetime.timezone.utc, + ) + allowed_date = datetime.datetime( + 2024, + 1, + 2, + 12, + 0, + tzinfo=datetime.timezone.utc, + ) + + assert parser._filter_date(ignored_date) is None + assert parser._filter_date(another_ignored) is None + assert parser._filter_date(allowed_date) == allowed_date + + def test_filter_date_timezone_aware( + self, + regex_parser: RegexDateParserPlugin, + ) -> None: + """Should work with timezone-aware datetimes.""" + date_utc = datetime.datetime(2024, 1, 10, 12, 0, tzinfo=datetime.timezone.utc) + + result = regex_parser._filter_date(date_utc) + + assert result is not None + assert result.tzinfo is not None + + 
+@pytest.mark.date_parsing +class TestRegexDateParser: + @pytest.mark.parametrize( + ("filename", "content", "expected"), + [ + pytest.param( + "report-2023-12-25.txt", + "Event recorded on 25/12/2022.", + [ + datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc), + datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc), + ], + id="filename-y-m-d_and_content-d-m-y", + ), + pytest.param( + "img_2023.01.02.jpg", + "Taken on 01/02/2023", + [ + datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc), + datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc), + ], + id="ambiguous-dates-respect-orders", + ), + pytest.param( + "notes.txt", + "bad date 99/99/9999 and 25/12/2022", + [ + datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc), + ], + id="parse-exception-skips-bad-and-yields-good", + ), + ], + ) + def test_parse_returns_expected_dates( + self, + base_config: DateParserConfig, + mocker: pytest_mock.MockerFixture, + filename: str, + content: str, + expected: list[datetime.datetime], + ) -> None: + """ + High-level tests that exercise RegexDateParser.parse only. + dateparser.parse is mocked so tests are deterministic. 
+ """ + parser = RegexDateParserPlugin(base_config) + + # Patch the dateparser.parse + target = "documents.plugins.date_parsing.base.dateparser.parse" + + def fake_parse( + date_string: str, + settings: dict[str, Any] | None = None, + locales: None = None, + ) -> datetime.datetime | None: + date_order = settings.get("DATE_ORDER") if settings else None + + # Filename-style YYYY-MM-DD / YYYY.MM.DD + if ( + "2023-12-25" in date_string + or "2023.12.25" in date_string + or "2023-12-25" in date_string + ): + return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc) + + # content DMY 25/12/2022 + if "25/12/2022" in date_string or "25-12-2022" in date_string: + return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc) + + # filename YMD 2023.01.02 + if "2023.01.02" in date_string or "2023-01-02" in date_string: + return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc) + + # ambiguous 01/02/2023 -> respect DATE_ORDER setting + if "01/02/2023" in date_string: + if date_order == "DMY": + return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc) + if date_order == "YMD": + return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc) + # fallback + return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc) + + # simulate parse failure for malformed input + if "99/99/9999" in date_string or "bad date" in date_string: + raise Exception("parse failed for malformed date") + + return None + + mocker.patch(target, side_effect=fake_parse) + + results = list(parser.parse(filename, content)) + + assert results == expected + for dt in results: + assert dt.tzinfo is not None + + def test_parse_filters_future_and_ignored_dates( + self, + mocker: pytest_mock.MockerFixture, + ) -> None: + """ + Ensure parser filters out: + - dates after reference_time + - dates whose .date() are in ignore_dates + """ + cfg = DateParserConfig( + languages=["en"], + timezone_str="UTC", + ignore_dates={datetime.date(2023, 12, 10)}, + 
reference_time=datetime.datetime( + 2024, + 1, + 15, + 12, + 0, + 0, + tzinfo=datetime.timezone.utc, + ), + filename_date_order="YMD", + content_date_order="DMY", + ) + parser = RegexDateParserPlugin(cfg) + + target = "documents.plugins.date_parsing.base.dateparser.parse" + + def fake_parse( + date_string: str, + settings: dict[str, Any] | None = None, + locales: None = None, + ) -> datetime.datetime | None: + if "10/12/2023" in date_string or "10-12-2023" in date_string: + # ignored date + return datetime.datetime(2023, 12, 10, tzinfo=datetime.timezone.utc) + if "01/02/2024" in date_string or "01-02-2024" in date_string: + # future relative to reference_time -> filtered + return datetime.datetime(2024, 2, 1, tzinfo=datetime.timezone.utc) + if "05/01/2023" in date_string or "05-01-2023" in date_string: + # valid + return datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc) + return None + + mocker.patch(target, side_effect=fake_parse) + + content = "Ignored: 10/12/2023, Future: 01/02/2024, Keep: 05/01/2023" + results = list(parser.parse("whatever.txt", content)) + + assert results == [datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)] + + def test_parse_handles_no_matches_and_returns_empty_list( + self, + base_config: DateParserConfig, + ) -> None: + """ + When there are no matching date-like substrings, parse should yield nothing. + """ + parser = RegexDateParserPlugin(base_config) + results = list( + parser.parse("no-dates.txt", "this has no dates whatsoever"), + ) + assert results == [] + + def test_parse_skips_filename_when_filename_date_order_none( + self, + mocker: pytest_mock.MockerFixture, + ) -> None: + """ + When filename_date_order is None the parser must not attempt to parse the filename. + Only dates found in the content should be passed to dateparser.parse. 
+ """ + cfg = DateParserConfig( + languages=["en"], + timezone_str="UTC", + ignore_dates=set(), + reference_time=datetime.datetime( + 2024, + 1, + 15, + 12, + 0, + 0, + tzinfo=datetime.timezone.utc, + ), + filename_date_order=None, + content_date_order="DMY", + ) + parser = RegexDateParserPlugin(cfg) + + # Patch the module's dateparser.parse so we can inspect calls + target = "documents.plugins.date_parsing.base.dateparser.parse" + + def fake_parse( + date_string: str, + settings: dict[str, Any] | None = None, + locales: None = None, + ) -> datetime.datetime | None: + # return distinct datetimes so we can tell which source was parsed + if "25/12/2022" in date_string: + return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc) + if "2023-12-25" in date_string: + return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc) + return None + + mock = mocker.patch(target, side_effect=fake_parse) + + filename = "report-2023-12-25.txt" + content = "Event recorded on 25/12/2022." 
+ + results = list(parser.parse(filename, content)) + + # Only the content date should have been parsed -> one call + assert mock.call_count == 1 + + # # first call, first positional arg + called_date_string = mock.call_args_list[0][0][0] + assert "25/12/2022" in called_date_string + # And the parser should have yielded the corresponding datetime + assert results == [ + datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc), + ] diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py index e3ccd8e4e3..c362f96465 100644 --- a/src/documents/tests/test_api_documents.py +++ b/src/documents/tests/test_api_documents.py @@ -1989,11 +1989,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") self.assertEqual(response.status_code, status.HTTP_200_OK) - @mock.patch("documents.parsers.parse_date_generator") + @mock.patch("documents.views.get_date_parser") @override_settings(NUMBER_OF_SUGGESTED_DATES=0) def test_get_suggestions_dates_disabled( self, - parse_date_generator, + mock_get_date_parser: mock.MagicMock, ): """ GIVEN: @@ -2010,7 +2010,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): ) self.client.get(f"/api/documents/{doc.pk}/suggestions/") - self.assertFalse(parse_date_generator.called) + + mock_get_date_parser.assert_not_called() def test_saved_views(self) -> None: u1 = User.objects.create_superuser("user1") diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py deleted file mode 100644 index c9c1fb8376..0000000000 --- a/src/documents/tests/test_date_parsing.py +++ /dev/null @@ -1,538 +0,0 @@ -import datetime -from zoneinfo import ZoneInfo - -import pytest -from pytest_django.fixtures import SettingsWrapper - -from documents.parsers import parse_date -from documents.parsers import parse_date_generator - - -@pytest.mark.django_db() -class 
TestDate: - def test_date_format_1(self) -> None: - text = "lorem ipsum 130218 lorem ipsum" - assert parse_date("", text) is None - - def test_date_format_2(self) -> None: - text = "lorem ipsum 2018 lorem ipsum" - assert parse_date("", text) is None - - def test_date_format_3(self) -> None: - text = "lorem ipsum 20180213 lorem ipsum" - assert parse_date("", text) is None - - def test_date_format_4(self, settings_timezone: ZoneInfo) -> None: - text = "lorem ipsum 13.02.2018 lorem ipsum" - date = parse_date("", text) - assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone) - - def test_date_format_5(self, settings_timezone: ZoneInfo) -> None: - text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum" - date = parse_date("", text) - assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone) - - def test_date_format_6(self) -> None: - text = ( - "lorem ipsum\n" - "Wohnort\n" - "3100\n" - "IBAN\n" - "AT87 4534\n" - "1234\n" - "1234 5678\n" - "BIC\n" - "lorem ipsum" - ) - assert parse_date("", text) is None - - def test_date_format_7( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> None: - settings.DATE_PARSER_LANGUAGES = ["de"] - text = "lorem ipsum\nMärz 2019\nlorem ipsum" - date = parse_date("", text) - assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone) - - def test_date_format_8( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> None: - settings.DATE_PARSER_LANGUAGES = ["de"] - text = ( - "lorem ipsum\n" - "Wohnort\n" - "3100\n" - "IBAN\n" - "AT87 4534\n" - "1234\n" - "1234 5678\n" - "BIC\n" - "lorem ipsum\n" - "März 2020" - ) - assert parse_date("", text) == datetime.datetime( - 2020, - 3, - 1, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_9( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> None: - settings.DATE_PARSER_LANGUAGES = ["de"] - text = "lorem ipsum\n27. 
Nullmonth 2020\nMärz 2020\nlorem ipsum" - assert parse_date("", text) == datetime.datetime( - 2020, - 3, - 1, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_10(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_11(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_12(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_13(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_14(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_15(self) -> None: - text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304" - assert parse_date("", text) is None - - def test_date_format_16(self) -> None: - text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304" - assert parse_date("", text) is None - - def test_date_format_17(self) -> None: - text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304" - assert parse_date("", text) is None - - def test_date_format_18(self) -> None: - text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304" - 
assert parse_date("", text) is None - - def test_date_format_19(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 21, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_20(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_21(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 2, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_22(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 23, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_23(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 24, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_24(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 21, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_25(self, settings_timezone: ZoneInfo) -> None: - text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 25, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_26(self, settings_timezone: ZoneInfo) -> None: - text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. 
P0 Box 182051" - assert parse_date("", text) == datetime.datetime( - 2019, - 9, - 25, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_crazy_date_past(self) -> None: - assert parse_date("", "01-07-0590 00:00:00") is None - - def test_crazy_date_future(self) -> None: - assert parse_date("", "01-07-2350 00:00:00") is None - - def test_crazy_date_with_spaces(self) -> None: - assert parse_date("", "20 408000l 2475") is None - - def test_utf_month_names( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> None: - settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"] - assert parse_date("", "13 décembre 2023") == datetime.datetime( - 2023, - 12, - 13, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "13 août 2022") == datetime.datetime( - 2022, - 8, - 13, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "11 März 2020") == datetime.datetime( - 2020, - 3, - 11, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "17. ožujka 2018.") == datetime.datetime( - 2018, - 3, - 17, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "1. veljače 2016.") == datetime.datetime( - 2016, - 2, - 1, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "15. února 1985") == datetime.datetime( - 1985, - 2, - 15, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "30. září 2011") == datetime.datetime( - 2011, - 9, - 30, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "28. května 1990") == datetime.datetime( - 1990, - 5, - 28, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "1. 
grudzień 1997") == datetime.datetime( - 1997, - 12, - 1, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "17 Şubat 2024") == datetime.datetime( - 2024, - 2, - 17, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "30 Ağustos 2012") == datetime.datetime( - 2012, - 8, - 30, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "17 Eylül 2000") == datetime.datetime( - 2000, - 9, - 17, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "5. október 1992") == datetime.datetime( - 1992, - 10, - 5, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_multiple_dates(self, settings_timezone: ZoneInfo) -> None: - text = """This text has multiple dates. - For example 02.02.2018, 22 July 2022 and December 2021. - But not 24-12-9999 because it's in the future...""" - dates = list(parse_date_generator("", text)) - - assert dates == [ - datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone), - datetime.datetime( - 2022, - 7, - 22, - 0, - 0, - tzinfo=settings_timezone, - ), - datetime.datetime( - 2021, - 12, - 1, - 0, - 0, - tzinfo=settings_timezone, - ), - ] - - def test_filename_date_parse_valid_ymd( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> None: - """ - GIVEN: - - Date parsing from the filename is enabled - - Filename date format is with Year Month Day (YMD) - - Filename contains date matching the format - - THEN: - - Should parse the date from the filename - """ - settings.FILENAME_DATE_ORDER = "YMD" - - assert parse_date( - "/tmp/Scan-2022-04-01.pdf", - "No date in here", - ) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone) - - def test_filename_date_parse_valid_dmy( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> None: - """ - GIVEN: - - Date parsing from the filename is enabled - - Filename date format is with Day Month Year (DMY) - - Filename contains date matching the format - - THEN: - - Should parse the date from the filename 
- """ - settings.FILENAME_DATE_ORDER = "DMY" - assert parse_date( - "/tmp/Scan-10.01.2021.pdf", - "No date in here", - ) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone) - - def test_filename_date_parse_invalid(self, settings: SettingsWrapper) -> None: - """ - GIVEN: - - Date parsing from the filename is enabled - - Filename includes no date - - File content includes no date - - THEN: - - No date is parsed - """ - settings.FILENAME_DATE_ORDER = "YMD" - assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None - - def test_filename_date_ignored_use_content( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> None: - """ - GIVEN: - - Date parsing from the filename is enabled - - Filename date format is with Day Month Year (YMD) - - Date order is Day Month Year (DMY, the default) - - Filename contains date matching the format - - Filename date is an ignored date - - File content includes a date - - THEN: - - Should parse the date from the content not filename - """ - settings.FILENAME_DATE_ORDER = "YMD" - settings.IGNORE_DATES = (datetime.date(2022, 4, 1),) - assert parse_date( - "/tmp/Scan-2022-04-01.pdf", - "The matching date is 24.03.2022", - ) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone) - - def test_ignored_dates_default_order( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> None: - """ - GIVEN: - - Ignore dates have been set - - File content includes ignored dates - - File content includes 1 non-ignored date - - THEN: - - Should parse the date non-ignored date from content - """ - settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)) - text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum" - assert parse_date("", text) == datetime.datetime( - 2018, - 2, - 13, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_ignored_dates_order_ymd( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ) -> 
None: - """ - GIVEN: - - Ignore dates have been set - - Date order is Year Month Date (YMD) - - File content includes ignored dates - - File content includes 1 non-ignored date - - THEN: - - Should parse the date non-ignored date from content - """ - - settings.FILENAME_DATE_ORDER = "YMD" - settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)) - - text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum" - - assert parse_date("", text) == datetime.datetime( - 2018, - 2, - 13, - 0, - 0, - tzinfo=settings_timezone, - ) diff --git a/src/documents/views.py b/src/documents/views.py index fbd72b10d1..c634c007e3 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -148,7 +148,6 @@ from documents.models import Workflow from documents.models import WorkflowAction from documents.models import WorkflowTrigger from documents.parsers import get_parser_class_for_mime_type -from documents.parsers import parse_date_generator from documents.permissions import AcknowledgeTasksPermissions from documents.permissions import PaperlessAdminPermissions from documents.permissions import PaperlessNotePermissions @@ -158,6 +157,7 @@ from documents.permissions import get_document_count_filter_for_user from documents.permissions import get_objects_for_user_owner_aware from documents.permissions import has_perms_owner_aware from documents.permissions import set_permissions_for_object +from documents.plugins.date_parsing import get_date_parser from documents.schema import generate_object_with_permissions_schema from documents.serialisers import AcknowledgeTasksViewSerializer from documents.serialisers import BulkDownloadSerializer @@ -1023,16 +1023,17 @@ class DocumentViewSet( dates = [] if settings.NUMBER_OF_SUGGESTED_DATES > 0: - gen = parse_date_generator(doc.filename, doc.content) - dates = sorted( - { - i - for i in itertools.islice( - gen, - settings.NUMBER_OF_SUGGESTED_DATES, - ) - }, - ) + with get_date_parser() as date_parser: + gen 
= date_parser.parse(doc.filename, doc.content) + dates = sorted( + { + i + for i in itertools.islice( + gen, + settings.NUMBER_OF_SUGGESTED_DATES, + ) + }, + ) resp_data = { "correspondents": [