from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
+from paperless.config import OcrConfig
+from paperless.utils import ocr_to_dateparser_languages
if TYPE_CHECKING:
import datetime
"""
import dateparser
+ ocr_config = OcrConfig()
+ languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
+ ocr_config.language,
+ )
+
return dateparser.parse(
ds,
settings={
"RETURN_AS_TIMEZONE_AWARE": True,
"TIMEZONE": settings.TIME_ZONE,
},
- locales=settings.DATE_PARSER_LANGUAGES,
+ locales=languages,
)
def __filter(date: datetime.datetime) -> datetime.datetime | None:
import datetime
from zoneinfo import ZoneInfo
+import pytest
from pytest_django.fixtures import SettingsWrapper
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
+@pytest.mark.django_db()
class TestDate:
def test_date_format_1(self):
text = "lorem ipsum 130218 lorem ipsum"
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
- settings.DATE_PARSER_LANGUAGES = []
+ settings.DATE_PARSER_LANGUAGES = ["de"]
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
from django.utils.translation import gettext_lazy as _
from dotenv import load_dotenv
-from paperless.utils import ocr_to_dateparser_languages
-
logger = logging.getLogger("paperless.settings")
# Tap paperless.conf if it's available
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
-def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
- """
- Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
- into a list of locales compatible with the `dateparser` library.
-
- - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
- Falls back to the base language (e.g., "az") if needed.
- - If a language cannot be mapped or validated, it is skipped with a warning.
- - Returns a list of valid locales, or an empty list if none could be converted.
- """
- ocr_to_dateparser = ocr_to_dateparser_languages()
- loader = LocaleDataLoader()
- result = []
- try:
- for ocr_language in ocr_languages.split("+"):
- # Split into language and optional script
- ocr_lang_part, *script = ocr_language.split("_")
- ocr_script_part = script[0] if script else None
-
- language_part = ocr_to_dateparser.get(ocr_lang_part)
- if language_part is None:
- logger.debug(
- f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
- )
- continue
-
- # Ensure base language is supported by dateparser
- loader.get_locale_map(locales=[language_part])
-
- # Try to add the script part if it's supported by dateparser
- if ocr_script_part:
- dateparser_language = f"{language_part}-{ocr_script_part.title()}"
- try:
- loader.get_locale_map(locales=[dateparser_language])
- except Exception:
- logger.info(
- f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
- )
- dateparser_language = language_part
- else:
- dateparser_language = language_part
- if dateparser_language not in result:
- result.append(dateparser_language)
- except Exception as e:
- logger.warning(
- f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
- )
- return []
- if not result:
- logger.info(
- "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
- )
- return result
-
-
def _parse_dateparser_languages(languages: str | None):
language_list = languages.split("+") if languages else []
# There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
return list(LocaleDataLoader().get_locale_map(locales=language_list))
-if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"):
- DATE_PARSER_LANGUAGES = _parse_dateparser_languages(
+# If not set, we will infer it at runtime
+DATE_PARSER_LANGUAGES = (
+ _parse_dateparser_languages(
os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
)
-else:
- DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE)
+ if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
+ else None
+)
# Maximum number of dates taken from document start to end to show as suggestions for
import pytest
from celery.schedules import crontab
-from paperless.settings import _ocr_to_dateparser_languages
from paperless.settings import _parse_base_paths
from paperless.settings import _parse_beat_schedule
from paperless.settings import _parse_dateparser_languages
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
-@pytest.mark.parametrize(
- ("ocr_language", "expected"),
- [
- # One language
- ("eng", ["en"]),
- # Multiple languages
- ("fra+ita+lao", ["fr", "it", "lo"]),
- # Languages that don't have a two-letter equivalent
- ("fil", ["fil"]),
- # Languages with a script part supported by dateparser
- ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
- # Languages with a script part not supported by dateparser
- # In this case, default to the language without script
- ("deu_frak", ["de"]),
- # Traditional and simplified chinese don't have the same name in dateparser,
- # so they're converted to the general chinese language
- ("chi_tra+chi_sim", ["zh"]),
- # If a language is not supported by dateparser, fallback to the supported ones
- ("eng+unsupported_language+por", ["en", "pt"]),
- # If no language is supported, fallback to default
- ("unsupported1+unsupported2", []),
- ],
-)
-def test_ocr_to_dateparser_languages(ocr_language, expected):
- assert sorted(_ocr_to_dateparser_languages(ocr_language)) == sorted(expected)
-
-
@pytest.mark.parametrize(
("languages", "expected"),
[
--- /dev/null
+import logging
+
+import pytest
+
+from paperless import utils
+from paperless.utils import ocr_to_dateparser_languages
+
+
+@pytest.mark.parametrize(
+ ("ocr_language", "expected"),
+ [
+ # One language
+ ("eng", ["en"]),
+ # Multiple languages
+ ("fra+ita+lao", ["fr", "it", "lo"]),
+ # Languages that don't have a two-letter equivalent
+ ("fil", ["fil"]),
+ # Languages with a script part supported by dateparser
+ ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
+ # Languages with a script part not supported by dateparser
+ # In this case, default to the language without script
+ ("deu_frak", ["de"]),
+ # Traditional and simplified chinese don't have the same name in dateparser,
+ # so they're converted to the general chinese language
+ ("chi_tra+chi_sim", ["zh"]),
+ # If a language is not supported by dateparser, fallback to the supported ones
+ ("eng+unsupported_language+por", ["en", "pt"]),
+ # If no language is supported, fallback to default
+ ("unsupported1+unsupported2", []),
+ # Duplicate languages, should not duplicate in result
+ ("eng+eng", ["en"]),
+ # Language with script, but script is not mapped
+ ("ita_unknownscript", ["it"]),
+ ],
+)
+def test_ocr_to_dateparser_languages(ocr_language, expected):
+ assert sorted(ocr_to_dateparser_languages(ocr_language)) == sorted(expected)
+
+
+def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog):
+ # Patch LocaleDataLoader.get_locale_map to raise an exception
+ class DummyLoader:
+ def get_locale_map(self, locales=None):
+ raise RuntimeError("Simulated error")
+
+ with caplog.at_level(logging.WARNING):
+ monkeypatch.setattr(utils, "LocaleDataLoader", lambda: DummyLoader())
+ result = utils.ocr_to_dateparser_languages("eng+fra")
+ assert result == []
+ assert (
+ "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this" in caplog.text
+ )
-def ocr_to_dateparser_languages() -> dict[str, str]:
+import logging
+
+from dateparser.languages.loader import LocaleDataLoader
+
+logger = logging.getLogger("paperless.utils")
+
+OCR_TO_DATEPARSER_LANGUAGES = {
"""
Translation map from languages supported by Tesseract OCR
to languages supported by dateparser.
# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
- return {
- "afr": "af",
- "amh": "am",
- "ara": "ar",
- "asm": "as",
- "ast": "ast",
- "aze": "az",
- "bel": "be",
- "bul": "bg",
- "ben": "bn",
- "bod": "bo",
- "bre": "br",
- "bos": "bs",
- "cat": "ca",
- "cher": "chr",
- "ces": "cs",
- "cym": "cy",
- "dan": "da",
- "deu": "de",
- "dzo": "dz",
- "ell": "el",
- "eng": "en",
- "epo": "eo",
- "spa": "es",
- "est": "et",
- "eus": "eu",
- "fas": "fa",
- "fin": "fi",
- "fil": "fil",
- "fao": "fo", # codespell:ignore
- "fra": "fr",
- "fry": "fy",
- "gle": "ga",
- "gla": "gd",
- "glg": "gl",
- "guj": "gu",
- "heb": "he",
- "hin": "hi",
- "hrv": "hr",
- "hun": "hu",
- "hye": "hy",
- "ind": "id",
- "isl": "is",
- "ita": "it",
- "jpn": "ja",
- "kat": "ka",
- "kaz": "kk",
- "khm": "km",
- "knda": "kn",
- "kor": "ko",
- "kir": "ky",
- "ltz": "lb",
- "lao": "lo",
- "lit": "lt",
- "lav": "lv",
- "mal": "ml",
- "mon": "mn",
- "mar": "mr",
- "msa": "ms",
- "mlt": "mt",
- "mya": "my",
- "nep": "ne",
- "nld": "nl",
- "ori": "or",
- "pan": "pa",
- "pol": "pl",
- "pus": "ps",
- "por": "pt",
- "que": "qu",
- "ron": "ro",
- "rus": "ru",
- "sin": "si",
- "slk": "sk",
- "slv": "sl",
- "sqi": "sq",
- "srp": "sr",
- "swe": "sv",
- "swa": "sw",
- "tam": "ta",
- "tel": "te", # codespell:ignore
- "tha": "th", # codespell:ignore
- "tir": "ti",
- "tgl": "tl",
- "ton": "to",
- "tur": "tr",
- "uig": "ug",
- "ukr": "uk",
- "urd": "ur",
- "uzb": "uz",
- "via": "vi",
- "yid": "yi",
- "yor": "yo",
- "chi": "zh",
- }
+ "afr": "af",
+ "amh": "am",
+ "ara": "ar",
+ "asm": "as",
+ "ast": "ast",
+ "aze": "az",
+ "bel": "be",
+ "bul": "bg",
+ "ben": "bn",
+ "bod": "bo",
+ "bre": "br",
+ "bos": "bs",
+ "cat": "ca",
+ "cher": "chr",
+ "ces": "cs",
+ "cym": "cy",
+ "dan": "da",
+ "deu": "de",
+ "dzo": "dz",
+ "ell": "el",
+ "eng": "en",
+ "epo": "eo",
+ "spa": "es",
+ "est": "et",
+ "eus": "eu",
+ "fas": "fa",
+ "fin": "fi",
+ "fil": "fil",
+ "fao": "fo", # codespell:ignore
+ "fra": "fr",
+ "fry": "fy",
+ "gle": "ga",
+ "gla": "gd",
+ "glg": "gl",
+ "guj": "gu",
+ "heb": "he",
+ "hin": "hi",
+ "hrv": "hr",
+ "hun": "hu",
+ "hye": "hy",
+ "ind": "id",
+ "isl": "is",
+ "ita": "it",
+ "jpn": "ja",
+ "kat": "ka",
+ "kaz": "kk",
+ "khm": "km",
+ "knda": "kn",
+ "kor": "ko",
+ "kir": "ky",
+ "ltz": "lb",
+ "lao": "lo",
+ "lit": "lt",
+ "lav": "lv",
+ "mal": "ml",
+ "mon": "mn",
+ "mar": "mr",
+ "msa": "ms",
+ "mlt": "mt",
+ "mya": "my",
+ "nep": "ne",
+ "nld": "nl",
+ "ori": "or",
+ "pan": "pa",
+ "pol": "pl",
+ "pus": "ps",
+ "por": "pt",
+ "que": "qu",
+ "ron": "ro",
+ "rus": "ru",
+ "sin": "si",
+ "slk": "sk",
+ "slv": "sl",
+ "sqi": "sq",
+ "srp": "sr",
+ "swe": "sv",
+ "swa": "sw",
+ "tam": "ta",
+ "tel": "te", # codespell:ignore
+ "tha": "th", # codespell:ignore
+ "tir": "ti",
+ "tgl": "tl",
+ "ton": "to",
+ "tur": "tr",
+ "uig": "ug",
+ "ukr": "uk",
+ "urd": "ur",
+ "uzb": "uz",
+ "via": "vi",
+ "yid": "yi",
+ "yor": "yo",
+ "chi": "zh",
+}
+
+
+def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
+ """
+ Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
+ into a list of locales compatible with the `dateparser` library.
+
+ - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
+ Falls back to the base language (e.g., "az") if needed.
+ - If a language cannot be mapped or validated, it is skipped with a warning.
+ - Returns a list of valid locales, or an empty list if none could be converted.
+ """
+ loader = LocaleDataLoader()
+ result = []
+ try:
+ for ocr_language in ocr_languages.split("+"):
+ # Split into language and optional script
+ ocr_lang_part, *script = ocr_language.split("_")
+ ocr_script_part = script[0] if script else None
+
+ language_part = OCR_TO_DATEPARSER_LANGUAGES.get(ocr_lang_part)
+ if language_part is None:
+ logger.debug(
+ f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
+ )
+ continue
+
+ # Ensure base language is supported by dateparser
+ loader.get_locale_map(locales=[language_part])
+
+ # Try to add the script part if it's supported by dateparser
+ if ocr_script_part:
+ dateparser_language = f"{language_part}-{ocr_script_part.title()}"
+ try:
+ loader.get_locale_map(locales=[dateparser_language])
+ except Exception:
+ logger.info(
+ f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
+ )
+ dateparser_language = language_part
+ else:
+ dateparser_language = language_part
+ if dateparser_language not in result:
+ result.append(dateparser_language)
+ except Exception as e:
+ logger.warning(
+ f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
+ )
+ return []
+ if not result:
+ logger.info(
+ "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
+ )
+ return result