From 193d8388f606296ea86f30b108f9a39ab886e50b Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Wed, 15 Jun 2022 21:18:18 +0000 Subject: [PATCH 1/4] Revert "bpo-23689: re module, fix memory leak when a match is terminated by a signal or memory allocation failure (GH-32283)" This reverts commit 6e3eee5c11b539e9aab39cff783acf57838c355a. Manual fixups to increase the MAGIC number and to handle conflicts with a couple of changes that landed after that. --- Lib/re/_compiler.py | 97 ++++++++++++++++++++++-------------- Lib/re/_constants.py | 2 +- Lib/test/test_re.py | 28 +---------- Modules/_sre/clinic/sre.c.h | 25 +++------- Modules/_sre/sre.c | 65 ++++++++---------------- Modules/_sre/sre.h | 4 -- Modules/_sre/sre_constants.h | 2 +- Modules/_sre/sre_lib.h | 30 ++++++----- 8 files changed, 109 insertions(+), 144 deletions(-) diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 4b5322338cbd5f..a9763f2e831784 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -28,11 +28,45 @@ POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), } -class _CompileData: - __slots__ = ('code', 'repeat_count') - def __init__(self): - self.code = [] - self.repeat_count = 0 +# Sets of lowercase characters which have the same uppercase. +_equivalences = ( + # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I + (0x69, 0x131), # iı + # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S + (0x73, 0x17f), # sſ + # MICRO SIGN, GREEK SMALL LETTER MU + (0xb5, 0x3bc), # µμ + # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI + (0x345, 0x3b9, 0x1fbe), # \u0345ιι + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + (0x390, 0x1fd3), # ΐΐ + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + (0x3b0, 0x1fe3), # ΰΰ + # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL + (0x3b2, 0x3d0), # βϐ + # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL + (0x3b5, 0x3f5), # εϵ + # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL + (0x3b8, 0x3d1), # θϑ + # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL + (0x3ba, 0x3f0), # κϰ + # GREEK SMALL LETTER PI, GREEK PI SYMBOL + (0x3c0, 0x3d6), # πϖ + # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL + (0x3c1, 0x3f1), # ρϱ + # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA + (0x3c2, 0x3c3), # ςσ + # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL + (0x3c6, 0x3d5), # φϕ + # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE + (0x1e61, 0x1e9b), # ṡẛ + # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST + (0xfb05, 0xfb06), # ſtst +) + +# Maps the lowercase code to lowercase codes which have the same uppercase. +_ignorecase_fixes = {i: tuple(j for j in t if i != j) + for t in _equivalences for i in t} def _combine_flags(flags, add_flags, del_flags, TYPE_FLAGS=_parser.TYPE_FLAGS): @@ -40,9 +74,8 @@ def _combine_flags(flags, add_flags, del_flags, flags &= ~TYPE_FLAGS return (flags | add_flags) & ~del_flags -def _compile(data, pattern, flags): +def _compile(code, pattern, flags): # internal: compile a (sub)pattern - code = data.code emit = code.append _len = len LITERAL_CODES = _LITERAL_CODES @@ -115,7 +148,7 @@ def _compile(data, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) - _compile(data, av[2], flags) + _compile(code, av[2], flags) emit(SUCCESS) code[skip] = _len(code) - skip else: @@ -123,11 +156,7 @@ def _compile(data, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) - # now op is in (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT) - if op != POSSESSIVE_REPEAT: - emit(data.repeat_count) - data.repeat_count += 1 - _compile(data, av[2], flags) + _compile(code, av[2], flags) code[skip] = _len(code) - skip emit(REPEATING_CODES[op][1]) elif op is SUBPATTERN: @@ -136,7 +165,7 @@ def _compile(data, pattern, flags): emit(MARK) emit((group-1)*2) # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) - _compile(data, p, _combine_flags(flags, add_flags, del_flags)) + _compile(code, p, _combine_flags(flags, add_flags, del_flags)) if group: emit(MARK) emit((group-1)*2+1) @@ -148,7 +177,7 @@ def _compile(data, pattern, flags): # pop their stack if they reach it emit(ATOMIC_GROUP) skip = _len(code); emit(0) - _compile(data, av, flags) + _compile(code, av, flags) emit(SUCCESS) code[skip] = _len(code) - skip elif op in SUCCESS_CODES: @@ -163,7 +192,7 @@ def _compile(data, pattern, flags): if lo != hi: raise error("look-behind requires fixed-width pattern") emit(lo) # look behind - _compile(data, av[1], flags) + _compile(code, av[1], flags) emit(SUCCESS) code[skip] = _len(code) - skip elif op is AT: @@ -182,7 +211,7 @@ def _compile(data, pattern, flags): for av in av[1]: skip = _len(code); emit(0) # _compile_info(code, av, flags) - _compile(data, av, flags) + _compile(code, av, flags) emit(JUMP) tailappend(_len(code)); emit(0) code[skip] = _len(code) - skip @@ -210,12 +239,12 @@ def _compile(data, pattern, flags): emit(op) emit(av[0]-1) skipyes = _len(code); emit(0) - _compile(data, av[1], flags) + _compile(code, av[1], flags) if av[2]: emit(JUMP) skipno = _len(code); emit(0) code[skipyes] = _len(code) - skipyes + 1 - _compile(data, av[2], flags) + _compile(code, av[2], flags) code[skipno] = _len(code) - skipno else: code[skipyes] = _len(code) - skipyes + 1 @@ -582,17 +611,17 @@ def isstring(obj): def _code(p, flags): flags = p.state.flags | flags - data = _CompileData() + code = [] # compile info block - _compile_info(data.code, p, flags) + _compile_info(code, p, flags) # compile the pattern - _compile(data, p.data, flags) + _compile(code, p.data, flags) - data.code.append(SUCCESS) + code.append(SUCCESS) - return data + return code def _hex_code(code): return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) @@ -693,7 +722,7 @@ def print_2(*args): else: print_(FAILURE) i += 1 - elif op in (REPEAT_ONE, MIN_REPEAT_ONE, + elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): skip, min, max = code[i: i+3] if max == MAXREPEAT: @@ -701,13 +730,6 @@ def print_2(*args): print_(op, skip, min, max, to=i+skip) dis_(i+3, i+skip) i += skip - elif op is REPEAT: - skip, min, max, repeat_index = code[i: i+4] - if max == MAXREPEAT: - max = 'MAXREPEAT' - print_(op, skip, min, max, repeat_index, to=i+skip) - dis_(i+4, i+skip) - i += skip elif op is GROUPREF_EXISTS: arg, skip = code[i: i+2] print_(op, arg, skip, to=i+skip) @@ -762,11 +784,11 @@ def compile(p, flags=0): else: pattern = None - data = _code(p, flags) + code = _code(p, flags) if flags & SRE_FLAG_DEBUG: print() - dis(data.code) + dis(code) # map in either direction groupindex = p.state.groupdict @@ -775,6 +797,7 @@ def compile(p, flags=0): indexgroup[i] = k return _sre.compile( - pattern, flags | p.state.flags, data.code, - p.state.groups-1, groupindex, tuple(indexgroup), - data.repeat_count) + pattern, flags | p.state.flags, code, + p.state.groups-1, + groupindex, tuple(indexgroup) + ) diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py index 1cc85c631f22b4..10ee14bfab46ee 100644 --- a/Lib/re/_constants.py +++ b/Lib/re/_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20220423 +MAGIC = 20220615 from _sre import MAXREPEAT, MAXGROUPS diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 3752d734dbdef2..9f734d47c54499 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1765,12 +1765,9 @@ def test_dealloc(self): long_overflow = 2**128 self.assertRaises(TypeError, re.finditer, "a", {}) with self.assertRaises(OverflowError): - _sre.compile("abc", 0, [long_overflow], 0, {}, (), 0) + _sre.compile("abc", 0, [long_overflow], 0, {}, ()) with self.assertRaises(TypeError): - _sre.compile({}, 0, [], 0, [], [], 0) - with self.assertRaises(RuntimeError): - # invalid repeat_count -1 - _sre.compile("abc", 0, [1], 0, {}, (), -1) + _sre.compile({}, 0, [], 0, [], []) def test_search_dot_unicode(self): self.assertTrue(re.search("123.*-", '123abc-')) @@ -2509,27 +2506,6 @@ def test_possesive_repeat(self): 14. SUCCESS ''') - def test_repeat_index(self): - self.assertEqual(get_debug_out(r'(?:ab)*?(?:cd)*'), '''\ -MIN_REPEAT 0 MAXREPEAT - LITERAL 97 - LITERAL 98 -MAX_REPEAT 0 MAXREPEAT - LITERAL 99 - LITERAL 100 - - 0. INFO 4 0b0 0 MAXREPEAT (to 5) - 5: REPEAT 8 0 MAXREPEAT 0 (to 14) -10. LITERAL 0x61 ('a') -12. LITERAL 0x62 ('b') -14: MIN_UNTIL -15. REPEAT 8 0 MAXREPEAT 1 (to 24) -20. LITERAL 0x63 ('c') -22. LITERAL 0x64 ('d') -24: MAX_UNTIL -25. SUCCESS -''') - class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index e243c756e1f971..6780fb9424d2fe 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -764,7 +764,7 @@ PyDoc_STRVAR(_sre_SRE_Pattern___deepcopy____doc__, PyDoc_STRVAR(_sre_compile__doc__, "compile($module, /, pattern, flags, code, groups, groupindex,\n" -" indexgroup, repeat_count)\n" +" indexgroup)\n" "--\n" "\n"); @@ -774,24 +774,23 @@ PyDoc_STRVAR(_sre_compile__doc__, static PyObject * _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, PyObject *code, Py_ssize_t groups, PyObject *groupindex, - PyObject *indexgroup, Py_ssize_t repeat_count); + PyObject *indexgroup); static PyObject * _sre_compile(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"pattern", "flags", "code", "groups", "groupindex", "indexgroup", "repeat_count", NULL}; + static const char * const _keywords[] = {"pattern", "flags", "code", "groups", "groupindex", "indexgroup", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "compile", 0}; - PyObject *argsbuf[7]; + PyObject *argsbuf[6]; PyObject *pattern; int flags; PyObject *code; Py_ssize_t groups; PyObject *groupindex; PyObject *indexgroup; - Py_ssize_t repeat_count; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 7, 7, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 6, 6, 0, argsbuf); if (!args) { goto exit; } @@ -827,19 +826,7 @@ _sre_compile(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject goto exit; } indexgroup = args[5]; - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[6]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); - } - if (ival == -1 && PyErr_Occurred()) { - goto exit; - } - repeat_count = ival; - } - return_value = _sre_compile_impl(module, pattern, flags, code, groups, groupindex, indexgroup, repeat_count); + return_value = _sre_compile_impl(module, pattern, flags, code, groups, groupindex, indexgroup); exit: return return_value; diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index a1da180892fb4a..bcb30848d9a592 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -427,12 +427,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->lastmark = -1; state->lastindex = -1; - state->repeats_array = PyMem_New(SRE_REPEAT, pattern->repeat_count); - if (!state->repeats_array) { - PyErr_NoMemory(); - goto err; - } - state->buffer.buf = NULL; ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer); if (!ptr) @@ -482,9 +476,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, safely casted to `void*`, see bpo-39943 for details. */ PyMem_Free((void*) state->mark); state->mark = NULL; - PyMem_Free(state->repeats_array); - state->repeats_array = NULL; - if (state->buffer.buf) PyBuffer_Release(&state->buffer); return NULL; @@ -500,8 +491,6 @@ state_fini(SRE_STATE* state) /* See above PyMem_Del for why we explicitly cast here. */ PyMem_Free((void*) state->mark); state->mark = NULL; - PyMem_Free(state->repeats_array); - state->repeats_array = NULL; } /* calculate offset from start of string */ @@ -1408,15 +1397,14 @@ _sre.compile groups: Py_ssize_t groupindex: object(subclass_of='&PyDict_Type') indexgroup: object(subclass_of='&PyTuple_Type') - repeat_count: Py_ssize_t [clinic start generated code]*/ static PyObject * _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, PyObject *code, Py_ssize_t groups, PyObject *groupindex, - PyObject *indexgroup, Py_ssize_t repeat_count) -/*[clinic end generated code: output=922af562d51b1657 input=77e39c322501ec2a]*/ + PyObject *indexgroup) +/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/ { /* "compile" pattern descriptor to pattern object */ @@ -1474,8 +1462,8 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, self->pattern = pattern; self->flags = flags; + self->groups = groups; - self->repeat_count = repeat_count; if (PyDict_GET_SIZE(groupindex) > 0) { Py_INCREF(groupindex); @@ -1647,7 +1635,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end) } static int -_validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) +_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) { /* Some variables are manipulated by the macros above */ SRE_CODE op; @@ -1668,8 +1656,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) sre_match() code is robust even if they don't, and the worst you can get is nonsensical match results. */ GET_ARG; - if (arg > 2 * (size_t)self->groups + 1) { - VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)self->groups)); + if (arg > 2 * (size_t)groups + 1) { + VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups)); FAIL; } break; @@ -1798,7 +1786,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) if (skip == 0) break; /* Stop 2 before the end; we check the JUMP below */ - if (!_validate_inner(code, code+skip-3, self)) + if (!_validate_inner(code, code+skip-3, groups)) FAIL; code += skip-3; /* Check that it ends with a JUMP, and that each JUMP @@ -1827,7 +1815,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (!_validate_inner(code, code+skip-4, self)) + if (!_validate_inner(code, code+skip-4, groups)) FAIL; code += skip-4; GET_OP; @@ -1839,7 +1827,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) case SRE_OP_REPEAT: case SRE_OP_POSSESSIVE_REPEAT: { - SRE_CODE op1 = op, min, max, repeat_index; + SRE_CODE op1 = op, min, max; GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; @@ -1847,17 +1835,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (op1 == SRE_OP_REPEAT) { - GET_ARG; repeat_index = arg; - if (repeat_index >= (size_t)self->repeat_count) - FAIL; - skip -= 4; - } else { - skip -= 3; - } - if (!_validate_inner(code, code+skip, self)) + if (!_validate_inner(code, code+skip-3, groups)) FAIL; - code += skip; + code += skip-3; GET_OP; if (op1 == SRE_OP_POSSESSIVE_REPEAT) { if (op != SRE_OP_SUCCESS) @@ -1873,7 +1853,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) case SRE_OP_ATOMIC_GROUP: { GET_SKIP; - if (!_validate_inner(code, code+skip-2, self)) + if (!_validate_inner(code, code+skip-2, groups)) FAIL; code += skip-2; GET_OP; @@ -1887,7 +1867,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) case SRE_OP_GROUPREF_UNI_IGNORE: case SRE_OP_GROUPREF_LOC_IGNORE: GET_ARG; - if (arg >= (size_t)self->groups) + if (arg >= (size_t)groups) FAIL; break; @@ -1896,7 +1876,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) 'group' is either an integer group number or a group name, 'then' and 'else' are sub-regexes, and 'else' is optional. */ GET_ARG; - if (arg >= (size_t)self->groups) + if (arg >= (size_t)groups) FAIL; GET_SKIP_ADJ(1); code--; /* The skip is relative to the first arg! */ @@ -1929,17 +1909,17 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) code[skip-3] == SRE_OP_JUMP) { VTRACE(("both then and else parts present\n")); - if (!_validate_inner(code+1, code+skip-3, self)) + if (!_validate_inner(code+1, code+skip-3, groups)) FAIL; code += skip-2; /* Position after JUMP, at */ GET_SKIP; - if (!_validate_inner(code, code+skip-1, self)) + if (!_validate_inner(code, code+skip-1, groups)) FAIL; code += skip-1; } else { VTRACE(("only a then part present\n")); - if (!_validate_inner(code+1, code+skip-1, self)) + if (!_validate_inner(code+1, code+skip-1, groups)) FAIL; code += skip-1; } @@ -1953,7 +1933,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) if (arg & 0x80000000) FAIL; /* Width too large */ /* Stop 1 before the end; we check the SUCCESS below */ - if (!_validate_inner(code+1, code+skip-2, self)) + if (!_validate_inner(code+1, code+skip-2, groups)) FAIL; code += skip-2; GET_OP; @@ -1972,19 +1952,18 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) } static int -_validate_outer(SRE_CODE *code, SRE_CODE *end, PatternObject *self) +_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) { - if (self->groups < 0 || (size_t)self->groups > SRE_MAXGROUPS || - self->repeat_count < 0 || + if (groups < 0 || (size_t)groups > SRE_MAXGROUPS || code >= end || end[-1] != SRE_OP_SUCCESS) FAIL; - return _validate_inner(code, end-1, self); + return _validate_inner(code, end-1, groups); } static int _validate(PatternObject *self) { - if (!_validate_outer(self->code, self->code+self->codesize, self)) + if (!_validate_outer(self->code, self->code+self->codesize, self->groups)) { PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); return 0; diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h index aff064d343ec4c..52ae3e11b5f750 100644 --- a/Modules/_sre/sre.h +++ b/Modules/_sre/sre.h @@ -29,8 +29,6 @@ typedef struct { Py_ssize_t groups; /* must be first! */ PyObject* groupindex; /* dict */ PyObject* indexgroup; /* tuple */ - /* the number of REPEATs */ - Py_ssize_t repeat_count; /* compatibility */ PyObject* pattern; /* pattern source (or None) */ int flags; /* flags used when compiling pattern source */ @@ -85,8 +83,6 @@ typedef struct { size_t data_stack_base; /* current repeat context */ SRE_REPEAT *repeat; - /* repeat contexts array */ - SRE_REPEAT *repeats_array; } SRE_STATE; typedef struct { diff --git a/Modules/_sre/sre_constants.h b/Modules/_sre/sre_constants.h index 590d5be7cb4d94..c6335147368626 100644 --- a/Modules/_sre/sre_constants.h +++ b/Modules/_sre/sre_constants.h @@ -11,7 +11,7 @@ * See the sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20220423 +#define SRE_MAGIC 20220615 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 1e5b50170ae76e..fb4c18b63d643d 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -1079,12 +1079,17 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> <3=repeat_index> item tail */ - TRACE(("|%p|%p|REPEAT %d %d %d\n", pattern, ptr, - pattern[1], pattern[2], pattern[3])); - - /* install repeat context */ - ctx->u.rep = &state->repeats_array[pattern[3]]; + TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr, + pattern[1], pattern[2])); + /* install new repeat context */ + /* TODO(https://github.com/python/cpython/issues/67877): Fix this + * potential memory leak. */ + ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep)); + if (!ctx->u.rep) { + PyErr_NoMemory(); + RETURN_FAILURE; + } ctx->u.rep->count = -1; ctx->u.rep->pattern = pattern; ctx->u.rep->prev = state->repeat; @@ -1094,6 +1099,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ptr; DO_JUMP(JUMP_REPEAT, jump_repeat, pattern+pattern[0]); state->repeat = ctx->u.rep->prev; + PyObject_Free(ctx->u.rep); if (ret) { RETURN_ON_ERROR(ret); @@ -1103,8 +1109,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) TARGET(SRE_OP_MAX_UNTIL): /* maximizing repeat */ - /* <1=min> <2=max> - <3=repeat_index> item tail */ + /* <1=min> <2=max> item tail */ /* FIXME: we probably need to deal with zero-width matches in here... */ @@ -1124,7 +1129,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, - ctx->u.rep->pattern+4); + ctx->u.rep->pattern+3); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1146,7 +1151,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, - ctx->u.rep->pattern+4); + ctx->u.rep->pattern+3); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { MARK_POP_DISCARD(ctx->lastmark); @@ -1171,8 +1176,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) TARGET(SRE_OP_MIN_UNTIL): /* minimizing repeat */ - /* <1=min> <2=max> - <3=repeat_index> item tail */ + /* <1=min> <2=max> item tail */ ctx->u.rep = state->repeat; if (!ctx->u.rep) @@ -1189,7 +1193,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, - ctx->u.rep->pattern+4); + ctx->u.rep->pattern+3); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1232,7 +1236,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, - ctx->u.rep->pattern+4); + ctx->u.rep->pattern+3); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { RETURN_ON_ERROR(ret); From 53708033600826e98872ac0e7a239558124e5c4f Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Wed, 15 Jun 2022 21:22:47 +0000 Subject: [PATCH 2/4] Fix clinic marker. --- Modules/_sre/clinic/sre.c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index 6780fb9424d2fe..048a494f1bc7c6 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -1116,4 +1116,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const } return _sre_SRE_Scanner_search_impl(self, cls); } -/*[clinic end generated code: output=97e7ce058366760b input=a9049054013a1b77]*/ +/*[clinic end generated code: output=fd2f45c941620e6e input=a9049054013a1b77]*/ From a31366c819fefeb7a1dc6909e4af7198921d9e1c Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Wed, 15 Jun 2022 21:36:00 +0000 Subject: [PATCH 3/4] Add a news entry. --- .../next/Library/2022-06-15-21-35-11.gh-issue-91404.39TZzW.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2022-06-15-21-35-11.gh-issue-91404.39TZzW.rst diff --git a/Misc/NEWS.d/next/Library/2022-06-15-21-35-11.gh-issue-91404.39TZzW.rst b/Misc/NEWS.d/next/Library/2022-06-15-21-35-11.gh-issue-91404.39TZzW.rst new file mode 100644 index 00000000000000..e20b15c7b75864 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-06-15-21-35-11.gh-issue-91404.39TZzW.rst @@ -0,0 +1,3 @@ +Revert the :mod:`re` memory leak when a match is terminated by a signal or +memory allocation failure as the implemented fix caused a major performance +regression. From ff8323d60024fdef1eafcddfd424710a22980845 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Wed, 15 Jun 2022 21:41:43 +0000 Subject: [PATCH 4/4] remove more rollback manual merge mixup. these were removed in https://github.com/python/cpython/commit/f912cc0e413f667a8cc257a41775272bc641b0d8 --- Lib/re/_compiler.py | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index a9763f2e831784..d8e0d2fdefdcca 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -28,46 +28,6 @@ POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), } -# Sets of lowercase characters which have the same uppercase. -_equivalences = ( - # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I - (0x69, 0x131), # iı - # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S - (0x73, 0x17f), # sſ - # MICRO SIGN, GREEK SMALL LETTER MU - (0xb5, 0x3bc), # µμ - # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI - (0x345, 0x3b9, 0x1fbe), # \u0345ιι - # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA - (0x390, 0x1fd3), # ΐΐ - # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA - (0x3b0, 0x1fe3), # ΰΰ - # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL - (0x3b2, 0x3d0), # βϐ - # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL - (0x3b5, 0x3f5), # εϵ - # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL - (0x3b8, 0x3d1), # θϑ - # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL - (0x3ba, 0x3f0), # κϰ - # GREEK SMALL LETTER PI, GREEK PI SYMBOL - (0x3c0, 0x3d6), # πϖ - # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL - (0x3c1, 0x3f1), # ρϱ - # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA - (0x3c2, 0x3c3), # ςσ - # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL - (0x3c6, 0x3d5), # φϕ - # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE - (0x1e61, 0x1e9b), # ṡẛ - # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST - (0xfb05, 0xfb06), # ſtst -) - -# Maps the lowercase code to lowercase codes which have the same uppercase. -_ignorecase_fixes = {i: tuple(j for j in t if i != j) - for t in _equivalences for i in t} - def _combine_flags(flags, add_flags, del_flags, TYPE_FLAGS=_parser.TYPE_FLAGS): if add_flags & TYPE_FLAGS: