git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Enhancement: improve date parsing with accented characters (#7100)
author: Fabien Dubuy <59292746+fdubuy@users.noreply.github.com>
Thu, 27 Jun 2024 03:47:37 +0000 (05:47 +0200)
committer: GitHub <noreply@github.com>
Thu, 27 Jun 2024 03:47:37 +0000 (20:47 -0700)
src/documents/parsers.py
src/documents/tests/test_date_parsing.py

index ed70f653d3fdcf8d0743100a0f8081b107544bc7..09b1442c00150492bd4e0f1589cb79356a095360 100644 (file)
@@ -37,13 +37,14 @@ from documents.utils import run_subprocess
 # TODO: isn't there a date parsing library for this?
 
 DATE_REGEX = re.compile(
-    r"(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[a-zA-Z]{3,9} [0-9]{4}|[a-zA-Z]{3,9} [0-9]{1,2}, [0-9]{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([0-9]{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-][0-9]{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][a-zA-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))",
+    r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))(\d{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
+    re.IGNORECASE,
 )
 
 
index 006ae5e960545fb48004afdf2bc960e88a908ff9..25309527541ea2b00f3969f58a39a7c045269079 100644 (file)
@@ -192,15 +192,69 @@ class TestDate(TestCase):
             datetime.datetime(2019, 9, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
         )
 
-    def test_crazy_date_past(self, *args):
+    def test_crazy_date_past(self):
         self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))
 
-    def test_crazy_date_future(self, *args):
+    def test_crazy_date_future(self):
         self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))
 
-    def test_crazy_date_with_spaces(self, *args):
+    def test_crazy_date_with_spaces(self):
         self.assertIsNone(parse_date("", "20 408000l 2475"))
 
+    def test_utf_month_names(self):
+        self.assertEqual(
+            parse_date("", "13 décembre 2023"),
+            datetime.datetime(2023, 12, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "13 août 2022"),
+            datetime.datetime(2022, 8, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "11 März 2020"),
+            datetime.datetime(2020, 3, 11, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "17. ožujka 2018."),
+            datetime.datetime(2018, 3, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "1. veljače 2016."),
+            datetime.datetime(2016, 2, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "15. února 1985"),
+            datetime.datetime(1985, 2, 15, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "30. září 2011"),
+            datetime.datetime(2011, 9, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "28. května 1990"),
+            datetime.datetime(1990, 5, 28, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "1. grudzień 1997"),
+            datetime.datetime(1997, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "17 Şubat 2024"),
+            datetime.datetime(2024, 2, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "30 Ağustos 2012"),
+            datetime.datetime(2012, 8, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "17 Eylül 2000"),
+            datetime.datetime(2000, 9, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+        self.assertEqual(
+            parse_date("", "5. október 1992"),
+            datetime.datetime(1992, 10, 5, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
     def test_multiple_dates(self):
         text = """This text has multiple dates.
                   For example 02.02.2018, 22 July 2022 and December 2021.