From 5f4e5b598cab86d5fd5727d423c9728221889ed0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 14 Oct 2024 16:29:20 +0300 Subject: [PATCH] gh-53203: Fix strptime() for %c, %x and %X formats on many locales (GH-125406) Fixed most locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan. --- Lib/_strptime.py | 68 ++++++++++++------- Lib/test/test_strptime.py | 34 ++++++---- Lib/test/test_time.py | 2 +- ...4-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst | 2 + 4 files changed, 66 insertions(+), 40 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst diff --git a/Lib/_strptime.py b/Lib/_strptime.py index 89adc174e5ad30..5f4d2475c0169b 100644 --- a/Lib/_strptime.py +++ b/Lib/_strptime.py @@ -15,6 +15,7 @@ import locale import calendar from re import compile as re_compile +from re import sub as re_sub from re import IGNORECASE from re import escape as re_escape from datetime import (date as datetime_date, @@ -129,11 +130,23 @@ def __calc_date_time(self): time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0)) time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0)) replacement_pairs = [ - ('1999', '%Y'), ('99', '%y'), ('22', '%H'), - ('44', '%M'), ('55', '%S'), ('76', '%j'), - ('17', '%d'), ('03', '%m'), ('3', '%m'), - # '3' needed for when no leading zero. - ('2', '%w'), ('10', '%I')] + ('1999', '%Y'), ('99', '%y'), ('22', '%H'), + ('44', '%M'), ('55', '%S'), ('76', '%j'), + ('17', '%d'), ('03', '%m'), ('3', '%m'), + # '3' needed for when no leading zero. + ('2', '%w'), ('10', '%I'), + # Non-ASCII digits + ('\u0661\u0669\u0669\u0669', '%Y'), + ('\u0669\u0669', '%Oy'), + ('\u0662\u0662', '%OH'), + ('\u0664\u0664', '%OM'), + ('\u0665\u0665', '%OS'), + ('\u0661\u0667', '%Od'), + ('\u0660\u0663', '%Om'), + ('\u0663', '%Om'), + ('\u0662', '%Ow'), + ('\u0661\u0660', '%OI'), + ] date_time = [] for directive in ('%c', '%x', '%X'): current_format = time.strftime(directive, time_tuple).lower() @@ -158,6 +171,10 @@ def __calc_date_time(self): for tz in tz_values: if tz: current_format = current_format.replace(tz, "%Z") + # Transform all non-ASCII digits to digits in range U+0660 to U+0669. + current_format = re_sub(r'\d(?3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", 'f': r"(?P[0-9]{1,6})", @@ -296,11 +313,15 @@ def __init__(self, locale_time=None): 'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone for tz in tz_names), 'Z'), - '%': '%'}) - base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) - base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) - base.__setitem__('x', self.pattern(self.locale_time.LC_date)) + '%': '%'} + for d in 'dmyHIMS': + mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d + mapping['Ow'] = r'(?P\d)' + mapping['W'] = mapping['U'].replace('U', 'W') + base.__init__(mapping) base.__setitem__('X', self.pattern(self.locale_time.LC_time)) + base.__setitem__('x', self.pattern(self.locale_time.LC_date)) + base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) def __seqToRE(self, to_convert, directive): """Convert a list to a regex string for matching a directive. @@ -328,28 +349,25 @@ def pattern(self, format): regex syntax are escaped. """ - processed_format = '' # The sub() call escapes all characters that might be misconstrued # as regex syntax. Cannot use re.escape since we have to deal with # format directives (%m, etc.). - regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])") - format = regex_chars.sub(r"\\\1", format) - whitespace_replacement = re_compile(r'\s+') - format = whitespace_replacement.sub(r'\\s+', format) + format = re_sub(r"([\\.^$*+?\(\){}\[\]|])", r"\\\1", format) + format = re_sub(r'\s+', r'\\s+', format) + format = re_sub(r"'", "['\u02bc]", format) # needed for br_FR year_in_format = False day_of_month_in_format = False - while '%' in format: - directive_index = format.index('%')+1 - format_char = format[directive_index] - processed_format = "%s%s%s" % (processed_format, - format[:directive_index-1], - self[format_char]) - format = format[directive_index+1:] + def repl(m): + format_char = m[1] match format_char: case 'Y' | 'y' | 'G': + nonlocal year_in_format year_in_format = True case 'd': + nonlocal day_of_month_in_format day_of_month_in_format = True + return self[format_char] + format = re_sub(r'%(O?.)', repl, format) if day_of_month_in_format and not year_in_format: import warnings warnings.warn("""\ @@ -360,7 +378,7 @@ def pattern(self, format): See https://github.com/python/cpython/issues/70647.""", DeprecationWarning, skip_file_prefixes=(os.path.dirname(__file__),)) - return "%s%s" % (processed_format, format) + return format def compile(self, format): """Return a compiled re object for the format string.""" @@ -434,8 +452,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"): _regex_cache[format] = format_regex found = format_regex.match(data_string) if not found: - raise ValueError("time data %r does not match format %r :: /%s/" % - (data_string, format, format_regex.pattern)) + raise ValueError("time data %r does not match format %r" % + (data_string, format)) if len(data_string) != found.end(): raise ValueError("unconverted data remains: %s" % data_string[found.end():]) diff --git a/Lib/test/test_strptime.py b/Lib/test/test_strptime.py index 79f48dfe44abde..12366b053a2fc1 100644 --- a/Lib/test/test_strptime.py +++ b/Lib/test/test_strptime.py @@ -292,7 +292,7 @@ def test_strptime_exception_context(self): # additional check for IndexError branch (issue #19545) with self.assertRaises(ValueError) as e: _strptime._strptime_time('19', '%Y %') - self.assertIs(e.exception.__suppress_context__, True) + self.assertIsNone(e.exception.__context__) def test_unconverteddata(self): # Check ValueError is raised when there is unconverted data @@ -485,12 +485,14 @@ def test_bad_timezone(self): # id_ID, ms_MY. # * Year is not included: ha_NG. # * Use non-Gregorian calendar: lo_LA, thai, th_TH. + # On Windows: ar_IN, ar_SA, fa_IR, ps_AF. # # BUG: Generates regexp that does not match the current date and time - # for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM. + # for lzh_TW. @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', 'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG', - 'csb_PL', 'br_FR', 'gez_ET', 'brx_IN') + 'csb_PL', 'br_FR', 'gez_ET', 'brx_IN', + 'my_MM', 'or_IN', 'shn_MM', 'az_IR') def test_date_time_locale(self): # Test %c directive loc = locale.getlocale(locale.LC_TIME)[0] @@ -512,20 +514,23 @@ def test_date_time_locale(self): self.roundtrip('%c', slice(0, 6), time.localtime(now - 366*24*3600)) # NB: Dates before 1969 do not roundtrip on some locales: - # bo_CN, bo_IN, dz_BT, eu_ES, eu_FR. + # az_IR, bo_CN, bo_IN, dz_BT, eu_ES, eu_FR, fa_IR, or_IN. @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', 'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG', - 'csb_PL', 'br_FR', 'gez_ET', 'brx_IN') + 'csb_PL', 'br_FR', 'gez_ET', 'brx_IN', + 'my_MM', 'shn_MM') def test_date_time_locale2(self): # Test %c directive self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0)) + self.roundtrip('%c', slice(0, 6), (1800, 1, 1, 0, 0, 0, 0, 1, 0)) # NB: Does not roundtrip because use non-Gregorian calendar: - # lo_LA, thai, th_TH. + # lo_LA, thai, th_TH. On Windows: ar_IN, ar_SA, fa_IR, ps_AF. # BUG: Generates regexp that does not match the current date - # for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM. + # for lzh_TW. @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', - 'he_IL', 'eu_ES', 'ar_AE') + 'he_IL', 'eu_ES', 'ar_AE', + 'az_IR', 'my_MM', 'or_IN', 'shn_MM') def test_date_locale(self): # Test %x directive now = time.time() @@ -545,10 +550,11 @@ def test_date_locale(self): "musl libc issue on Emscripten, bpo-46390" ) @run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', - 'eu_ES', 'ar_AE') + 'eu_ES', 'ar_AE', 'my_MM', 'shn_MM') def test_date_locale2(self): # Test %x directive self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0)) + self.roundtrip('%x', slice(0, 3), (1800, 1, 1, 0, 0, 0, 0, 1, 0)) # NB: Does not roundtrip in some locales due to the ambiguity of # the time representation (bugs in locales?): @@ -556,11 +562,11 @@ def test_date_locale2(self): # norwegian, nynorsk. # * Hours are in 12-hour notation without AM/PM indication: hy_AM, # ms_MY, sm_WS. - # BUG: Generates regexp that does not match the current time for - # aa_DJ, aa_ER, aa_ET, am_ET, az_IR, byn_ER, fa_IR, gez_ER, gez_ET, - # lzh_TW, my_MM, om_ET, om_KE, or_IN, shn_MM, sid_ET, so_DJ, so_ET, - # so_SO, ti_ER, ti_ET, tig_ER, wal_ET. - @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP') + # BUG: Generates regexp that does not match the current time for lzh_TW. + @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', + 'aa_ET', 'am_ET', 'az_IR', 'byn_ER', 'fa_IR', 'gez_ET', + 'my_MM', 'om_ET', 'or_IN', 'shn_MM', 'sid_ET', 'so_SO', + 'ti_ET', 'tig_ER', 'wal_ET') def test_time_locale(self): # Test %X directive now = time.time() diff --git a/Lib/test/test_time.py b/Lib/test/test_time.py index 5b5779231f06ce..27c0f51acc58ab 100644 --- a/Lib/test/test_time.py +++ b/Lib/test/test_time.py @@ -298,7 +298,7 @@ def test_strptime_exception_context(self): # additional check for IndexError branch (issue #19545) with self.assertRaises(ValueError) as e: time.strptime('19', '%Y %') - self.assertIs(e.exception.__suppress_context__, True) + self.assertIsNone(e.exception.__context__) def test_strptime_leap_year(self): # GH-70647: warns if parsing a format with a day and no year. diff --git a/Misc/NEWS.d/next/Library/2024-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst b/Misc/NEWS.d/next/Library/2024-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst new file mode 100644 index 00000000000000..cdfa8c191e8242 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-13-20-21-35.gh-issue-53203.Rz1c8A.rst @@ -0,0 +1,2 @@ +Fix :func:`time.strptime` for ``%c``, ``%x`` and ``%X`` formats in many +locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan.