# if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER:
- logger.info("Attempting parsing from filename")
for m in re.finditer(DATE_REGEX, filename):
date_string = m.group(0)
- logger.info(f"Found potential date: {date_string}")
try:
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
date = __filter(date)
if date is not None:
- logger.info(f"Found date: {date}")
return date
- else:
- logger.info("Filtered date out")
- logger.info("Attempting parsing from content")
# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
- logger.info(f"Found potential date: {date_string}")
try:
date = __parser(date_string, settings.DATE_ORDER)
date = __filter(date)
if date is not None:
- logger.info(f"Found date: {date}")
return date
- else:
- logger.info("Filtered date out")
return date
# Add the paperless_tika app config to INSTALLED_APPS only when
# PAPERLESS_TIKA_ENABLED is truthy.
if PAPERLESS_TIKA_ENABLED:
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
-# List dates that should be ignored when trying to parse date from document text
-IGNORE_DATES: Set[datetime.date] = set()
def _parse_ignore_dates(
    env_ignore: str,
    date_order: str = DATE_ORDER,
) -> Set[datetime.date]:
    """
    If the PAPERLESS_IGNORE_DATES environment variable is set, parse the
    user provided string(s) into dates

    Args:
        env_ignore (str): The value of the environment variable, comma separated dates
        date_order (str, optional): The order of day, month and year in the
            date strings (e.g. "DMY"), passed to dateparser as its DATE_ORDER
            setting. Defaults to DATE_ORDER.

    Returns:
        Set[datetime.date]: The set of parsed date objects. Entries that
            could not be parsed are left out.
    """
    import dateparser

    ignored_dates = set()
    for s in env_ignore.split(","):
        d = dateparser.parse(
            s,
            settings={
                "DATE_ORDER": date_order,
            },
        )
        # Skip anything the parser could not turn into a date
        if d:
            # Only the calendar date is kept; the time of day is dropped
            ignored_dates.add(d.date())
    return ignored_dates
# Dates that should be ignored when trying to parse a date from document text
IGNORE_DATES: Set[datetime.date] = set()

# Environment values are always strings, so "the key is present" is the same
# condition as "getenv did not return None".
if "PAPERLESS_IGNORE_DATES" in os.environ:
    IGNORE_DATES = _parse_ignore_dates(os.environ["PAPERLESS_IGNORE_DATES"])
Tests the parsing of the PAPERLESS_IGNORE_DATES setting value
"""
def _parse_checker(self, test_cases):
    """
    Helper function to check ignore date parsing

    Args:
        test_cases (Iterable[Tuple[str, str, Set[datetime.date]]]): Triples of
            (environment variable value, date order, expected set of dates)
    """
    for env_str, date_format, expected_date_set in test_cases:
        # One sub-test per case, so a failure reports the offending input
        # and the remaining cases are still checked
        with self.subTest(env_str=env_str, date_format=date_format):
            self.assertSetEqual(
                _parse_ignore_dates(env_str, date_format),
                expected_date_set,
            )
+
def test_no_ignore_dates_set(self):
"""
GIVEN:
- All ignore dates are parsed
"""
test_cases = [
- ("1985-05-01", [datetime.date(1985, 5, 1)]),
+ ("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}),
(
"1985-05-01,1991-12-05",
- [datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)],
+ "YMD",
+ {datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)},
+ ),
+ ("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}),
+ ("11.01.10", "DMY", {datetime.date(2010, 1, 11)}),
+ (
+ "11.01.2001,15-06-1996",
+ "DMY",
+ {datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)},
),
- ("2010-12-13", [datetime.date(2010, 12, 13)]),
]
- for env_str, expected_dates in test_cases:
- expected_date_set = set()
-
- for expected_date in expected_dates:
- expected_date_set.add(expected_date)
- self.assertSetEqual(
- _parse_ignore_dates(env_str),
- expected_date_set,
- )
+ self._parse_checker(test_cases)