From f912cc0e413f667a8cc257a41775272bc641b0d8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 22 Apr 2022 21:37:46 +0300 Subject: [PATCH] gh-91575: Add a script for generating data for case-insensitive matching in re (GH-91660) Also test that all extra cases are in BMP. --- Lib/re/_casefix.py | 106 ++++++++++++++++++ Lib/re/_compiler.py | 59 +--------- Makefile.pre.in | 6 + ...2-04-18-12-52-16.gh-issue-91575.fK1TEh.rst | 3 + Tools/scripts/generate_re_casefix.py | 95 ++++++++++++++++ 5 files changed, 212 insertions(+), 57 deletions(-) create mode 100644 Lib/re/_casefix.py create mode 100644 Misc/NEWS.d/next/Tools-Demos/2022-04-18-12-52-16.gh-issue-91575.fK1TEh.rst create mode 100755 Tools/scripts/generate_re_casefix.py diff --git a/Lib/re/_casefix.py b/Lib/re/_casefix.py new file mode 100644 index 00000000000000..06507d08bee02b --- /dev/null +++ b/Lib/re/_casefix.py @@ -0,0 +1,106 @@ +# Auto-generated by Tools/scripts/generate_re_casefix.py. + +# Maps the code of lowercased character to codes of different lowercased +# characters which have the same uppercase. 
+_EXTRA_CASES = { + # LATIN SMALL LETTER I: LATIN SMALL LETTER DOTLESS I + 0x0069: (0x0131,), # 'i': 'ı' + # LATIN SMALL LETTER S: LATIN SMALL LETTER LONG S + 0x0073: (0x017f,), # 's': 'ſ' + # MICRO SIGN: GREEK SMALL LETTER MU + 0x00b5: (0x03bc,), # 'µ': 'μ' + # LATIN SMALL LETTER DOTLESS I: LATIN SMALL LETTER I + 0x0131: (0x0069,), # 'ı': 'i' + # LATIN SMALL LETTER LONG S: LATIN SMALL LETTER S + 0x017f: (0x0073,), # 'ſ': 's' + # COMBINING GREEK YPOGEGRAMMENI: GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI + 0x0345: (0x03b9, 0x1fbe), # '\u0345': 'ιι' + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + 0x0390: (0x1fd3,), # 'ΐ': 'ΐ' + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + 0x03b0: (0x1fe3,), # 'ΰ': 'ΰ' + # GREEK SMALL LETTER BETA: GREEK BETA SYMBOL + 0x03b2: (0x03d0,), # 'β': 'ϐ' + # GREEK SMALL LETTER EPSILON: GREEK LUNATE EPSILON SYMBOL + 0x03b5: (0x03f5,), # 'ε': 'ϵ' + # GREEK SMALL LETTER THETA: GREEK THETA SYMBOL + 0x03b8: (0x03d1,), # 'θ': 'ϑ' + # GREEK SMALL LETTER IOTA: COMBINING GREEK YPOGEGRAMMENI, GREEK PROSGEGRAMMENI + 0x03b9: (0x0345, 0x1fbe), # 'ι': '\u0345ι' + # GREEK SMALL LETTER KAPPA: GREEK KAPPA SYMBOL + 0x03ba: (0x03f0,), # 'κ': 'ϰ' + # GREEK SMALL LETTER MU: MICRO SIGN + 0x03bc: (0x00b5,), # 'μ': 'µ' + # GREEK SMALL LETTER PI: GREEK PI SYMBOL + 0x03c0: (0x03d6,), # 'π': 'ϖ' + # GREEK SMALL LETTER RHO: GREEK RHO SYMBOL + 0x03c1: (0x03f1,), # 'ρ': 'ϱ' + # GREEK SMALL LETTER FINAL SIGMA: GREEK SMALL LETTER SIGMA + 0x03c2: (0x03c3,), # 'ς': 'σ' + # GREEK SMALL LETTER SIGMA: GREEK SMALL LETTER FINAL SIGMA + 0x03c3: (0x03c2,), # 'σ': 'ς' + # GREEK SMALL LETTER PHI: GREEK PHI SYMBOL + 0x03c6: (0x03d5,), # 'φ': 'ϕ' + # GREEK BETA SYMBOL: GREEK SMALL LETTER BETA + 0x03d0: (0x03b2,), # 'ϐ': 'β' + # GREEK THETA SYMBOL: GREEK SMALL LETTER THETA + 0x03d1: (0x03b8,), # 'ϑ': 'θ' + # GREEK PHI SYMBOL: GREEK SMALL LETTER PHI + 0x03d5: (0x03c6,), 
# 'ϕ': 'φ' + # GREEK PI SYMBOL: GREEK SMALL LETTER PI + 0x03d6: (0x03c0,), # 'ϖ': 'π' + # GREEK KAPPA SYMBOL: GREEK SMALL LETTER KAPPA + 0x03f0: (0x03ba,), # 'ϰ': 'κ' + # GREEK RHO SYMBOL: GREEK SMALL LETTER RHO + 0x03f1: (0x03c1,), # 'ϱ': 'ρ' + # GREEK LUNATE EPSILON SYMBOL: GREEK SMALL LETTER EPSILON + 0x03f5: (0x03b5,), # 'ϵ': 'ε' + # CYRILLIC SMALL LETTER VE: CYRILLIC SMALL LETTER ROUNDED VE + 0x0432: (0x1c80,), # 'в': 'ᲀ' + # CYRILLIC SMALL LETTER DE: CYRILLIC SMALL LETTER LONG-LEGGED DE + 0x0434: (0x1c81,), # 'д': 'ᲁ' + # CYRILLIC SMALL LETTER O: CYRILLIC SMALL LETTER NARROW O + 0x043e: (0x1c82,), # 'о': 'ᲂ' + # CYRILLIC SMALL LETTER ES: CYRILLIC SMALL LETTER WIDE ES + 0x0441: (0x1c83,), # 'с': 'ᲃ' + # CYRILLIC SMALL LETTER TE: CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE + 0x0442: (0x1c84, 0x1c85), # 'т': 'ᲄᲅ' + # CYRILLIC SMALL LETTER HARD SIGN: CYRILLIC SMALL LETTER TALL HARD SIGN + 0x044a: (0x1c86,), # 'ъ': 'ᲆ' + # CYRILLIC SMALL LETTER YAT: CYRILLIC SMALL LETTER TALL YAT + 0x0463: (0x1c87,), # 'ѣ': 'ᲇ' + # CYRILLIC SMALL LETTER ROUNDED VE: CYRILLIC SMALL LETTER VE + 0x1c80: (0x0432,), # 'ᲀ': 'в' + # CYRILLIC SMALL LETTER LONG-LEGGED DE: CYRILLIC SMALL LETTER DE + 0x1c81: (0x0434,), # 'ᲁ': 'д' + # CYRILLIC SMALL LETTER NARROW O: CYRILLIC SMALL LETTER O + 0x1c82: (0x043e,), # 'ᲂ': 'о' + # CYRILLIC SMALL LETTER WIDE ES: CYRILLIC SMALL LETTER ES + 0x1c83: (0x0441,), # 'ᲃ': 'с' + # CYRILLIC SMALL LETTER TALL TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER THREE-LEGGED TE + 0x1c84: (0x0442, 0x1c85), # 'ᲄ': 'тᲅ' + # CYRILLIC SMALL LETTER THREE-LEGGED TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE + 0x1c85: (0x0442, 0x1c84), # 'ᲅ': 'тᲄ' + # CYRILLIC SMALL LETTER TALL HARD SIGN: CYRILLIC SMALL LETTER HARD SIGN + 0x1c86: (0x044a,), # 'ᲆ': 'ъ' + # CYRILLIC SMALL LETTER TALL YAT: CYRILLIC SMALL LETTER YAT + 0x1c87: (0x0463,), # 'ᲇ': 'ѣ' + # CYRILLIC SMALL LETTER UNBLENDED UK: CYRILLIC SMALL LETTER MONOGRAPH UK + 0x1c88: 
(0xa64b,), # 'ᲈ': 'ꙋ' + # LATIN SMALL LETTER S WITH DOT ABOVE: LATIN SMALL LETTER LONG S WITH DOT ABOVE + 0x1e61: (0x1e9b,), # 'ṡ': 'ẛ' + # LATIN SMALL LETTER LONG S WITH DOT ABOVE: LATIN SMALL LETTER S WITH DOT ABOVE + 0x1e9b: (0x1e61,), # 'ẛ': 'ṡ' + # GREEK PROSGEGRAMMENI: COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA + 0x1fbe: (0x0345, 0x03b9), # 'ι': '\u0345ι' + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + 0x1fd3: (0x0390,), # 'ΐ': 'ΐ' + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + 0x1fe3: (0x03b0,), # 'ΰ': 'ΰ' + # CYRILLIC SMALL LETTER MONOGRAPH UK: CYRILLIC SMALL LETTER UNBLENDED UK + 0xa64b: (0x1c88,), # 'ꙋ': 'ᲈ' + # LATIN SMALL LIGATURE LONG S T: LATIN SMALL LIGATURE ST + 0xfb05: (0xfb06,), # 'ſt': 'st' + # LATIN SMALL LIGATURE ST: LATIN SMALL LIGATURE LONG S T + 0xfb06: (0xfb05,), # 'st': 'ſt' +} diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 065f6fbd73244e..f621d04af123d4 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -13,6 +13,7 @@ import _sre from . import _parser from ._constants import * +from ._casefix import _EXTRA_CASES assert _sre.MAGIC == MAGIC, "SRE module mismatch" @@ -27,62 +28,6 @@ POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), } -# Sets of lowercase characters which have the same uppercase. 
-_equivalences = ( - # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I - (0x69, 0x131), # iı - # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S - (0x73, 0x17f), # sſ - # MICRO SIGN, GREEK SMALL LETTER MU - (0xb5, 0x3bc), # µμ - # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI - (0x345, 0x3b9, 0x1fbe), # \u0345ιι - # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA - (0x390, 0x1fd3), # ΐΐ - # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA - (0x3b0, 0x1fe3), # ΰΰ - # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL - (0x3b2, 0x3d0), # βϐ - # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL - (0x3b5, 0x3f5), # εϵ - # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL - (0x3b8, 0x3d1), # θϑ - # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL - (0x3ba, 0x3f0), # κϰ - # GREEK SMALL LETTER PI, GREEK PI SYMBOL - (0x3c0, 0x3d6), # πϖ - # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL - (0x3c1, 0x3f1), # ρϱ - # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA - (0x3c2, 0x3c3), # ςσ - # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL - (0x3c6, 0x3d5), # φϕ - # CYRILLIC SMALL LETTER VE, CYRILLIC SMALL LETTER ROUNDED VE - (0x432, 0x1c80), # вᲀ - # CYRILLIC SMALL LETTER DE, CYRILLIC SMALL LETTER LONG-LEGGED DE - (0x434, 0x1c81), # дᲁ - # CYRILLIC SMALL LETTER O, CYRILLIC SMALL LETTER NARROW O - (0x43e, 0x1c82), # оᲂ - # CYRILLIC SMALL LETTER ES, CYRILLIC SMALL LETTER WIDE ES - (0x441, 0x1c83), # сᲃ - # CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE - (0x442, 0x1c84, 0x1c85), # тᲄᲅ - # CYRILLIC SMALL LETTER HARD SIGN, CYRILLIC SMALL LETTER TALL HARD SIGN - (0x44a, 0x1c86), # ъᲆ - # CYRILLIC SMALL LETTER YAT, CYRILLIC SMALL LETTER TALL YAT - (0x463, 0x1c87), # ѣᲇ - # CYRILLIC SMALL LETTER UNBLENDED UK, CYRILLIC SMALL LETTER MONOGRAPH UK - (0x1c88, 0xa64b), # ᲈꙋ - # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN 
SMALL LETTER LONG S WITH DOT ABOVE - (0x1e61, 0x1e9b), # ṡẛ - # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST - (0xfb05, 0xfb06), # ſtst -) - -# Maps the lowercase code to lowercase codes which have the same uppercase. -_ignorecase_fixes = {i: tuple(j for j in t if i != j) - for t in _equivalences for i in t} - class _CompileData: __slots__ = ('code', 'repeat_count') def __init__(self): @@ -111,7 +56,7 @@ def _compile(data, pattern, flags): if flags & SRE_FLAG_UNICODE: iscased = _sre.unicode_iscased tolower = _sre.unicode_tolower - fixes = _ignorecase_fixes + fixes = _EXTRA_CASES else: iscased = _sre.ascii_iscased tolower = _sre.ascii_tolower diff --git a/Makefile.pre.in b/Makefile.pre.in index 04a371ddff72fa..d9f821dd14e172 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -948,6 +948,12 @@ regen-test-frozenmain: $(BUILDPYTHON) # using Programs/freeze_test_frozenmain.py $(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Programs/freeze_test_frozenmain.py Programs/test_frozenmain.h +.PHONY: regen-re +regen-re: $(BUILDPYTHON) + # Regenerate Lib/re/_casefix.py + # using Tools/scripts/generate_re_casefix.py + $(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/scripts/generate_re_casefix.py $(srcdir)/Lib/re/_casefix.py + Programs/_testembed: Programs/_testembed.o $(LINK_PYTHON_DEPS) $(LINKCC) $(PY_CORE_LDFLAGS) $(LINKFORSHARED) -o $@ Programs/_testembed.o $(LINK_PYTHON_OBJS) $(LIBS) $(MODLIBS) $(SYSLIBS) diff --git a/Misc/NEWS.d/next/Tools-Demos/2022-04-18-12-52-16.gh-issue-91575.fK1TEh.rst b/Misc/NEWS.d/next/Tools-Demos/2022-04-18-12-52-16.gh-issue-91575.fK1TEh.rst new file mode 100644 index 00000000000000..3ed34226e070e6 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2022-04-18-12-52-16.gh-issue-91575.fK1TEh.rst @@ -0,0 +1,3 @@ +Add script ``Tools/scripts/generate_re_casefix.py`` and the make target +``regen-re`` for generating additional data for case-insensitive matching +according to the current Unicode version. 
#! /usr/bin/env python3
# This script generates Lib/re/_casefix.py.

import collections
import re
import sys
import unicodedata


def update_file(file, content):
    """Write *content* to *file* unless it already holds exactly that text.

    Returns True when the file was (re)written and False when it was left
    untouched.  A file that cannot be opened or decoded as UTF-8 is simply
    overwritten.
    """
    try:
        with open(file, 'r', encoding='utf-8') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        # Deliberate best effort: a missing or undecodable file
        # (UnicodeDecodeError is a ValueError) is replaced below.
        pass
    with open(file, 'w', encoding='utf-8') as fobj:
        fobj.write(content)
    return True


# Skeleton of the generated module; %s receives the formatted dict entries.
re_casefix_template = """\
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {
%s
}
"""


def uname(i):
    """Return the Unicode name of code point *i*, or 'U+XXXX' if unnamed."""
    return unicodedata.name(chr(i), r'U+%04X' % i)


class hexint(int):
    """An int whose repr() is zero-padded hexadecimal, e.g. 0x0069."""

    def __repr__(self):
        return '%#06x' % self


def alpha(i):
    """Return chr(i) if it is alphabetic, else its ASCII escape sequence."""
    c = chr(i)
    return c if c.isalpha() else ascii(c)[1:-1]


def find_equivalent_lower_codes(codes=None):
    """Return groups of lowercase code points that share one uppercase.

    *codes* is an iterable of code points to examine; by default every
    code point up to sys.maxunicode is scanned.  Each returned group is a
    sorted list of at least two distinct code points.
    """
    if codes is None:
        codes = range(sys.maxunicode + 1)

    # Collect the characters which have the same uppercase.
    same_upper = collections.defaultdict(list)
    for c in map(chr, codes):
        same_upper[c.upper()].append(c)

    groups = []
    for chars in same_upper.values():
        if len(chars) < 2:
            continue
        # ord() assumes every member lowercases to a single character.
        lower_codes = {ord(c.lower()) for c in chars}
        if len(lower_codes) > 1:
            groups.append(sorted(lower_codes))
    return groups


def find_non_bmp_codes(groups):
    """Return the codes of every group that reaches beyond the BMP.

    For each group containing a code point above U+FFFF, all members of the
    group are reported, followed by the uppercase of the first such code
    point when that uppercase is a single character.  The result is empty
    when every group lies within the BMP.
    """
    bad_codes = []
    for group in groups:
        first_astral = next((i for i in group if i > 0xffff), None)
        if first_astral is None:
            continue
        bad_codes.extend(group)
        try:
            bad_codes.append(ord(chr(first_astral).upper()))
        except (ValueError, TypeError):
            # The uppercase is not a single character; nothing to add.
            pass
    return bad_codes


def build_mapping(groups):
    """Map each code in *groups* to a tuple of the other codes in its group."""
    return {i: tuple(j for j in t if i != j)
            for t in groups
            for i in t}


def format_items(mapping):
    """Return the source lines of the dict entries for *mapping*.

    Every entry yields two lines: a comment naming the characters and the
    'key: (values,)' line itself, both indented by four spaces so that the
    generated dict literal is PEP 8 formatted.
    """
    items = []
    for i, t in sorted(mapping.items()):
        items.append('    # %s: %s' % (
            uname(i),
            ', '.join(map(uname, t)),
        ))
        items.append("    %r: %r, # '%s': '%s'" % (
            hexint(i),
            tuple(map(hexint, t)),
            alpha(i),
            ''.join(map(alpha, t)),
        ))
    return items


def main(outfile='Lib/re/_casefix.py'):
    """Regenerate *outfile* from the interpreter's Unicode case mappings.

    Exits with status 1, after listing the offending characters on stderr,
    if any equivalence group contains a code point outside the BMP.
    """
    equivalent_lower_codes = find_equivalent_lower_codes()

    bad_codes = find_non_bmp_codes(equivalent_lower_codes)
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for i in sorted(bad_codes):
            print(" '%s' (U+%04x, %s)" % (alpha(i), i, uname(i)),
                  file=sys.stderr)
        sys.exit(1)

    mapping = build_mapping(equivalent_lower_codes)
    update_file(outfile,
                re_casefix_template % '\n'.join(format_items(mapping)))


if __name__ == '__main__':
    main(*sys.argv[1:])