[3.12] gh-53203: Fix strptime() for %c, %x and %X formats on many locales (GH-125406...

author Serhiy Storchaka <storchaka@gmail.com>

Mon, 14 Oct 2024 20:59:01 +0000 (23:59 +0300)

committer GitHub <noreply@github.com>

Mon, 14 Oct 2024 20:59:01 +0000 (20:59 +0000)
author Serhiy Storchaka <storchaka@gmail.com>
Mon, 14 Oct 2024 20:59:01 +0000 (23:59 +0300)
committer GitHub <noreply@github.com>
Mon, 14 Oct 2024 20:59:01 +0000 (20:59 +0000)
diff --git a/Lib/_strptime.py b/Lib/_strptime.py

index d740c15519a75d10bd827acdd6239d019c789e88..dfd2bc5d8b4af5985f6374086bd0fb8aa5b49437 100644 (file)
--- a/Lib/_strptime.py
+++ b/Lib/_strptime.py
@@ -14,6 +14,7 @@ import time
  import locale
  import calendar
  from re import compile as re_compile
+from re import sub as re_sub
  from re import IGNORECASE
  from re import escape as re_escape
  from datetime import (date as datetime_date,
@@ -128,11 +129,23 @@ class LocaleTime(object):
          time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
          time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
          replacement_pairs = [
-                    ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
-                    ('44', '%M'), ('55', '%S'), ('76', '%j'),
-                    ('17', '%d'), ('03', '%m'), ('3', '%m'),
-                    # '3' needed for when no leading zero.
-                    ('2', '%w'), ('10', '%I')]
+            ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
+            ('44', '%M'), ('55', '%S'), ('76', '%j'),
+            ('17', '%d'), ('03', '%m'), ('3', '%m'),
+            # '3' needed for when no leading zero.
+            ('2', '%w'), ('10', '%I'),
+            # Non-ASCII digits
+            ('\u0661\u0669\u0669\u0669', '%Y'),
+            ('\u0669\u0669', '%Oy'),
+            ('\u0662\u0662', '%OH'),
+            ('\u0664\u0664', '%OM'),
+            ('\u0665\u0665', '%OS'),
+            ('\u0661\u0667', '%Od'),
+            ('\u0660\u0663', '%Om'),
+            ('\u0663', '%Om'),
+            ('\u0662', '%Ow'),
+            ('\u0661\u0660', '%OI'),
+        ]
          date_time = []
          for directive in ('%c', '%x', '%X'):
              current_format = time.strftime(directive, time_tuple).lower()
@@ -157,6 +170,10 @@ class LocaleTime(object):
                  for tz in tz_values:
                      if tz:
                          current_format = current_format.replace(tz, "%Z")
+            # Transform all non-ASCII digits to digits in range U+0660 to U+0669.
+            current_format = re_sub(r'\d(?<![0-9])',
+                                    lambda m: chr(0x0660 + int(m[0])),
+                                    current_format)
              for old, new in replacement_pairs:
                  current_format = current_format.replace(old, new)
              # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
@@ -266,7 +283,7 @@ class TimeRE(dict):
          else:
              self.locale_time = LocaleTime()
          base = super()
-        base.__init__({
+        mapping = {
              # The " [1-9]" part of the regex is to make %c from ANSI C work
              'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
              'f': r"(?P<f>[0-9]{1,6})",
@@ -295,11 +312,15 @@ class TimeRE(dict):
              'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
                                          for tz in tz_names),
                                  'Z'),
-            '%': '%'})
-        base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
-        base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
-        base.__setitem__('x', self.pattern(self.locale_time.LC_date))
+            '%': '%'}
+        for d in 'dmyHIMS':
+            mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
+        mapping['Ow'] = r'(?P<w>\d)'
+        mapping['W'] = mapping['U'].replace('U', 'W')
+        base.__init__(mapping)
          base.__setitem__('X', self.pattern(self.locale_time.LC_time))
+        base.__setitem__('x', self.pattern(self.locale_time.LC_date))
+        base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
  
      def __seqToRE(self, to_convert, directive):
          """Convert a list to a regex string for matching a directive.
@@ -327,21 +348,16 @@ class TimeRE(dict):
          regex syntax are escaped.
  
          """
-        processed_format = ''
          # The sub() call escapes all characters that might be misconstrued
          # as regex syntax.  Cannot use re.escape since we have to deal with
          # format directives (%m, etc.).
-        regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
-        format = regex_chars.sub(r"\\\1", format)
-        whitespace_replacement = re_compile(r'\s+')
-        format = whitespace_replacement.sub(r'\\s+', format)
-        while '%' in format:
-            directive_index = format.index('%')+1
-            processed_format = "%s%s%s" % (processed_format,
-                                           format[:directive_index-1],
-                                           self[format[directive_index]])
-            format = format[directive_index+1:]
-        return "%s%s" % (processed_format, format)
+        format = re_sub(r"([\\.^$*+?\(\){}\[\]|])", r"\\\1", format)
+        format = re_sub(r'\s+', r'\\s+', format)
+        format = re_sub(r"'", "['\u02bc]", format)  # needed for br_FR
+        def repl(m):
+            return self[m[1]]
+        format = re_sub(r'%(O?.)', repl, format)
+        return format
  
      def compile(self, format):
          """Return a compiled re object for the format string."""
@@ -415,8 +431,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
              _regex_cache[format] = format_regex
      found = format_regex.match(data_string)
      if not found:
-        raise ValueError("time data %r does not match format %r :: /%s/" %
-                         (data_string, format, format_regex.pattern))
+        raise ValueError("time data %r does not match format %r" %
+                         (data_string, format))
      if len(data_string) != found.end():
          raise ValueError("unconverted data remains: %s" %
                            data_string[found.end():])
diff --git a/Lib/test/test_strptime.py b/Lib/test/test_strptime.py

index fa7915bead83fde22e821bd57a33337b7f8bc56f..45b5c8ed2c30510b5c559a1d1c630bc3d9a240c6 100644 (file)
--- a/Lib/test/test_strptime.py
+++ b/Lib/test/test_strptime.py
@@ -290,7 +290,7 @@ class StrptimeTests(unittest.TestCase):
          # additional check for IndexError branch (issue #19545)
          with self.assertRaises(ValueError) as e:
              _strptime._strptime_time('19', '%Y %')
-        self.assertIs(e.exception.__suppress_context__, True)
+        self.assertIsNone(e.exception.__context__)
  
      def test_unconverteddata(self):
          # Check ValueError is raised when there is unconverted data
@@ -483,12 +483,14 @@ class StrptimeTests(unittest.TestCase):
      #   id_ID, ms_MY.
      # * Year is not included: ha_NG.
      # * Use non-Gregorian calendar: lo_LA, thai, th_TH.
+    #   On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
      #
      # BUG: Generates regexp that does not match the current date and time
-    # for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
+    # for lzh_TW.
      @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
                        'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
-                      'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
+                      'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
+                      'my_MM', 'or_IN', 'shn_MM', 'az_IR')
      def test_date_time_locale(self):
          # Test %c directive
          loc = locale.getlocale(locale.LC_TIME)[0]
@@ -510,20 +512,23 @@ class StrptimeTests(unittest.TestCase):
          self.roundtrip('%c', slice(0, 6), time.localtime(now - 366*24*3600))
  
      # NB: Dates before 1969 do not roundtrip on some locales:
-    # bo_CN, bo_IN, dz_BT, eu_ES, eu_FR.
+    # az_IR, bo_CN, bo_IN, dz_BT, eu_ES, eu_FR, fa_IR, or_IN.
      @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
                        'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
-                      'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
+                      'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
+                      'my_MM', 'shn_MM')
      def test_date_time_locale2(self):
          # Test %c directive
          self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
+        self.roundtrip('%c', slice(0, 6), (1800, 1, 1, 0, 0, 0, 0, 1, 0))
  
      # NB: Does not roundtrip because use non-Gregorian calendar:
-    # lo_LA, thai, th_TH.
+    # lo_LA, thai, th_TH. On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
      # BUG: Generates regexp that does not match the current date
-    # for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
+    # for lzh_TW.
      @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
-                      'he_IL', 'eu_ES', 'ar_AE')
+                      'he_IL', 'eu_ES', 'ar_AE',
+                      'az_IR', 'my_MM', 'or_IN', 'shn_MM')
      def test_date_locale(self):
          # Test %x directive
          now = time.time()
@@ -543,10 +548,11 @@ class StrptimeTests(unittest.TestCase):
          "musl libc issue on Emscripten, bpo-46390"
      )
      @run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
-                      'eu_ES', 'ar_AE')
+                      'eu_ES', 'ar_AE', 'my_MM', 'shn_MM')
      def test_date_locale2(self):
          # Test %x directive
          self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
+        self.roundtrip('%x', slice(0, 3), (1800, 1, 1, 0, 0, 0, 0, 1, 0))
  
      # NB: Does not roundtrip in some locales due to the ambiguity of
      # the time representation (bugs in locales?):
@@ -554,19 +560,27 @@ class StrptimeTests(unittest.TestCase):
      #   norwegian, nynorsk.
      # * Hours are in 12-hour notation without AM/PM indication: hy_AM,
      #   ms_MY, sm_WS.
-    # BUG: Generates regexp that does not match the current time for
-    # aa_DJ, aa_ER, aa_ET, am_ET, az_IR, byn_ER, fa_IR, gez_ER, gez_ET,
-    # lzh_TW, my_MM, om_ET, om_KE, or_IN, shn_MM, sid_ET, so_DJ, so_ET,
-    # so_SO, ti_ER, ti_ET, tig_ER, wal_ET.
-    @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP')
+    # BUG: Generates regexp that does not match the current time for lzh_TW.
+    @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
+                      'aa_ET', 'am_ET', 'az_IR', 'byn_ER', 'fa_IR', 'gez_ET',
+                      'my_MM', 'om_ET', 'or_IN', 'shn_MM', 'sid_ET', 'so_SO',
+                      'ti_ET', 'tig_ER', 'wal_ET')
      def test_time_locale(self):
          # Test %X directive
+        loc = locale.getlocale(locale.LC_TIME)[0]
+        pos = slice(3, 6)
+        if glibc_ver and glibc_ver < (2, 29) and loc in {
+                'aa_ET', 'am_ET', 'byn_ER', 'gez_ET', 'om_ET',
+                'sid_ET', 'so_SO', 'ti_ET', 'tig_ER', 'wal_ET'}:
+            # Hours are in 12-hour notation without AM/PM indication.
+            # Ignore hours.
+            pos = slice(4, 6)
          now = time.time()
-        self.roundtrip('%X', slice(3, 6), time.localtime(now))
+        self.roundtrip('%X', pos, time.localtime(now))
          # 1 hour 20 minutes 30 seconds ago
-        self.roundtrip('%X', slice(3, 6), time.localtime(now - 4830))
+        self.roundtrip('%X', pos, time.localtime(now - 4830))
          # 12 hours ago
-        self.roundtrip('%X', slice(3, 6), time.localtime(now - 12*3600))
+        self.roundtrip('%X', pos, time.localtime(now - 12*3600))
  
      def test_percent(self):
          # Make sure % signs are handled properly
diff --git a/Lib/test/test_time.py b/Lib/test/test_time.py

index 921e4eea649d6ba2570663c3c140e12bbf835d3a..ca40415f0cdd768b3e7c2f7c906fe583446f8ee7 100644 (file)
--- a/Lib/test/test_time.py
+++ b/Lib/test/test_time.py
@@ -292,7 +292,7 @@ class TimeTestCase(unittest.TestCase):
          # additional check for IndexError branch (issue #19545)
          with self.assertRaises(ValueError) as e:
              time.strptime('19', '%Y %')
-        self.assertIs(e.exception.__suppress_context__, True)
+        self.assertIsNone(e.exception.__context__)
  
      def test_asctime(self):
          time.asctime(time.gmtime(self.t))
diff --git a/Misc/NEWS.d/next/Library/2024-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst b/Misc/NEWS.d/next/Library/2024-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst

new file mode 100644 (file)

index 0000000..cdfa8c1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst
@@ -0,0 +1,2 @@
+Fix :func:`time.strptime` for ``%c``, ``%x`` and ``%X`` formats in many
+locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan.
author	Serhiy Storchaka <storchaka@gmail.com>
	Mon, 14 Oct 2024 20:59:01 +0000 (23:59 +0300)
committer	GitHub <noreply@github.com>
	Mon, 14 Oct 2024 20:59:01 +0000 (20:59 +0000)
Lib/_strptime.py		patch | blob | blame | history
Lib/test/test_strptime.py		patch | blob | blame | history
Lib/test/test_time.py		patch | blob | blame | history
Misc/NEWS.d/next/Library/2024-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst	[new file with mode: 0644]	patch | blob