From e5317fe2fd80a8fbafd8420d1a9013276b89678f Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:19:52 +0800 Subject: [PATCH 01/11] 1. add variables --- Lib/re/_constants.py | 2 +- .../next/Library/2022-04-03-13-19-08.bpo-23689.TFSc3E.rst | 2 ++ Modules/sre.h | 4 ++++ Modules/sre_constants.h | 2 +- 4 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-04-03-13-19-08.bpo-23689.TFSc3E.rst diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py index c735edfea1f13d..5317fd53e9c5a6 100644 --- a/Lib/re/_constants.py +++ b/Lib/re/_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20220318 +MAGIC = 20220402 from _sre import MAXREPEAT, MAXGROUPS diff --git a/Misc/NEWS.d/next/Library/2022-04-03-13-19-08.bpo-23689.TFSc3E.rst b/Misc/NEWS.d/next/Library/2022-04-03-13-19-08.bpo-23689.TFSc3E.rst new file mode 100644 index 00000000000000..1032087d9b850a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-03-13-19-08.bpo-23689.TFSc3E.rst @@ -0,0 +1,2 @@ +:mod:`re` module: fix memory leak when a match is terminated by a signal or +memory allocation failure. Patch by Ma Lin. diff --git a/Modules/sre.h b/Modules/sre.h index 785adbd003e7fd..e2c5277aefb5d9 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -29,6 +29,8 @@ typedef struct { Py_ssize_t groups; /* must be first! */ PyObject* groupindex; /* dict */ PyObject* indexgroup; /* tuple */ + /* the number of REPEATs */ + Py_ssize_t repeat_count; /* compatibility */ PyObject* pattern; /* pattern source (or None) */ int flags; /* flags used when compiling pattern source */ @@ -83,6 +85,8 @@ typedef struct { size_t data_stack_base; /* current repeat context */ SRE_REPEAT *repeat; + /* repeat contexts array */ + SRE_REPEAT *repeats_array; } SRE_STATE; typedef struct { diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 45395dcea807a3..8b249493bd5cd9 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20220318 +#define SRE_MAGIC 20220402 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 From 28e4d2d97f39897ad437683ee2710573579fd665 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:25:06 +0800 Subject: [PATCH 02/11] 2. add _CompileData class to _compile.py _CompileData can store intermediate data. --- Lib/re/_compiler.py | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 62da8e55d72abd..4bd87ed4087f8e 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -67,14 +67,21 @@ _ignorecase_fixes = {i: tuple(j for j in t if i != j) for t in _equivalences for i in t} +class _CompileData: + __slots__ = ('code', 'repeat_count') + def __init__(self): + self.code = [] + self.repeat_count = 0 + def _combine_flags(flags, add_flags, del_flags, TYPE_FLAGS=_parser.TYPE_FLAGS): if add_flags & TYPE_FLAGS: flags &= ~TYPE_FLAGS return (flags | add_flags) & ~del_flags -def _compile(code, pattern, flags): +def _compile(data, pattern, flags): # internal: compile a (sub)pattern + code = data.code emit = code.append _len = len LITERAL_CODES = _LITERAL_CODES @@ -147,7 +154,7 @@ def _compile(code, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) - _compile(code, av[2], flags) + _compile(data, av[2], flags) emit(SUCCESS) code[skip] = _len(code) - skip else: @@ -155,7 +162,7 @@ def _compile(code, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) - _compile(code, av[2], flags) + _compile(data, av[2], flags) code[skip] = _len(code) - skip emit(REPEATING_CODES[op][1]) elif op is SUBPATTERN: @@ -164,7 +171,7 @@ def _compile(code, pattern, flags): emit(MARK) emit((group-1)*2) # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) - _compile(code, p, _combine_flags(flags, add_flags, del_flags)) + _compile(data, p, _combine_flags(flags, add_flags, del_flags)) if group: emit(MARK) emit((group-1)*2+1) @@ -176,7 +183,7 @@ def _compile(code, pattern, flags): # pop their stack if they reach it emit(ATOMIC_GROUP) skip = _len(code); emit(0) - _compile(code, av, flags) + _compile(data, av, flags) emit(SUCCESS) code[skip] = _len(code) - skip elif op in SUCCESS_CODES: @@ -191,13 +198,13 @@ def _compile(code, pattern, flags): if lo != hi: raise error("look-behind requires fixed-width pattern") emit(lo) # look behind - _compile(code, av[1], flags) + _compile(data, av[1], flags) emit(SUCCESS) code[skip] = _len(code) - skip elif op is CALL: emit(op) skip = _len(code); emit(0) - _compile(code, av, flags) + _compile(data, av, flags) emit(SUCCESS) code[skip] = _len(code) - skip elif op is AT: @@ -216,7 +223,7 @@ def _compile(code, pattern, flags): for av in av[1]: skip = _len(code); emit(0) # _compile_info(code, av, flags) - _compile(code, av, flags) + _compile(data, av, flags) emit(JUMP) tailappend(_len(code)); emit(0) code[skip] = _len(code) - skip @@ -244,12 +251,12 @@ def _compile(code, pattern, flags): emit(op) emit(av[0]-1) skipyes = _len(code); emit(0) - _compile(code, av[1], flags) + _compile(data, av[1], flags) if av[2]: emit(JUMP) skipno = _len(code); emit(0) code[skipyes] = _len(code) - skipyes + 1 - _compile(code, av[2], flags) + _compile(data, av[2], flags) code[skipno] = _len(code) - skipno else: code[skipyes] = _len(code) - skipyes + 1 @@ -608,17 +615,17 @@ def isstring(obj): def _code(p, flags): flags = p.state.flags | flags - code = [] + data = _CompileData() # compile info block - _compile_info(code, p, flags) + _compile_info(data.code, p, flags) # compile the pattern - _compile(code, p.data, flags) + _compile(data, p.data, flags) - code.append(SUCCESS) + data.code.append(SUCCESS) - return code + return data def _hex_code(code): return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) @@ -781,11 +788,11 @@ def compile(p, flags=0): else: pattern = None - code = _code(p, flags) + data = _code(p, flags) if flags & SRE_FLAG_DEBUG: print() - dis(code) + dis(data.code) # map in either direction groupindex = p.state.groupdict @@ -794,7 +801,7 @@ def compile(p, flags=0): indexgroup[i] = k return _sre.compile( - pattern, flags | p.state.flags, code, + pattern, flags | p.state.flags, data.code, p.state.groups-1, groupindex, tuple(indexgroup) ) From 7493fad15987bad1de1da48c192877b912291555 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:26:39 +0800 Subject: [PATCH 03/11] 3. add repeat_count parameter to _sre.compile() function argument clinic --- Modules/_sre.c | 5 +++-- Modules/clinic/_sre.c.h | 27 ++++++++++++++++++++------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/Modules/_sre.c b/Modules/_sre.c index 48193f82475a42..513c10d8efa554 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1407,14 +1407,15 @@ _sre.compile groups: Py_ssize_t groupindex: object(subclass_of='&PyDict_Type') indexgroup: object(subclass_of='&PyTuple_Type') + repeat_count: Py_ssize_t [clinic start generated code]*/ static PyObject * _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, PyObject *code, Py_ssize_t groups, PyObject *groupindex, - PyObject *indexgroup) -/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/ + PyObject *indexgroup, Py_ssize_t repeat_count) +/*[clinic end generated code: output=922af562d51b1657 input=77e39c322501ec2a]*/ { /* "compile" pattern descriptor to pattern object */ diff --git a/Modules/clinic/_sre.c.h b/Modules/clinic/_sre.c.h index 72d772c289ae8b..34cbe21f14071b 100644 --- a/Modules/clinic/_sre.c.h +++ b/Modules/clinic/_sre.c.h @@ -544,7 +544,7 @@ PyDoc_STRVAR(_sre_SRE_Pattern___deepcopy____doc__, PyDoc_STRVAR(_sre_compile__doc__, "compile($module, /, pattern, flags, code, groups, groupindex,\n" -" indexgroup)\n" +" indexgroup, repeat_count)\n" "--\n" "\n"); @@ -554,23 +554,24 @@ PyDoc_STRVAR(_sre_compile__doc__, static PyObject * _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, PyObject *code, Py_ssize_t groups, PyObject *groupindex, - PyObject *indexgroup); + PyObject *indexgroup, Py_ssize_t repeat_count); static PyObject * _sre_compile(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"pattern", "flags", "code", "groups", "groupindex", "indexgroup", NULL}; + static const char * const _keywords[] = {"pattern", "flags", "code", "groups", "groupindex", "indexgroup", "repeat_count", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "compile", 0}; - PyObject *argsbuf[6]; + PyObject *argsbuf[7]; PyObject *pattern; int flags; PyObject *code; Py_ssize_t groups; PyObject *groupindex; PyObject *indexgroup; + Py_ssize_t repeat_count; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 6, 6, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 7, 7, 0, argsbuf); if (!args) { goto exit; } @@ -606,7 +607,19 @@ _sre_compile(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject goto exit; } indexgroup = args[5]; - return_value = _sre_compile_impl(module, pattern, flags, code, groups, groupindex, indexgroup); + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[6]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + repeat_count = ival; + } + return_value = _sre_compile_impl(module, pattern, flags, code, groups, groupindex, indexgroup, repeat_count); exit: return return_value; @@ -910,4 +923,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const exit: return return_value; } -/*[clinic end generated code: output=518f7bb775c1184f input=a9049054013a1b77]*/ +/*[clinic end generated code: output=9d7510a57a157a38 input=a9049054013a1b77]*/ From b928335409c967ce2a07a362eaad88c4a0662515 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:29:31 +0800 Subject: [PATCH 04/11] 4. emit repeat_count in sre_compile.py --- Lib/re/_compiler.py | 18 ++++++++++++++---- Lib/test/test_re.py | 4 ++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 4bd87ed4087f8e..bedd4b8f400164 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -162,6 +162,10 @@ def _compile(data, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) + # now op is in (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT) + if op != POSSESSIVE_REPEAT: + emit(data.repeat_count) + data.repeat_count += 1 _compile(data, av[2], flags) code[skip] = _len(code) - skip emit(REPEATING_CODES[op][1]) @@ -726,7 +730,7 @@ def print_2(*args): else: print_(FAILURE) i += 1 - elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, + elif op in (REPEAT_ONE, MIN_REPEAT_ONE, POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): skip, min, max = code[i: i+3] if max == MAXREPEAT: @@ -734,6 +738,13 @@ def print_2(*args): print_(op, skip, min, max, to=i+skip) dis_(i+3, i+skip) i += skip + elif op is REPEAT: + skip, min, max, repeat_index = code[i: i+4] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, min, max, repeat_index, to=i+skip) + dis_(i+4, i+skip) + i += skip elif op is GROUPREF_EXISTS: arg, skip = code[i: i+2] print_(op, arg, skip, to=i+skip) @@ -802,6 +813,5 @@ def compile(p, flags=0): return _sre.compile( pattern, flags | p.state.flags, data.code, - p.state.groups-1, - groupindex, tuple(indexgroup) - ) + p.state.groups-1, groupindex, tuple(indexgroup), + data.repeat_count) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index f1e5af452d8e06..565ea54ba0a9c8 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1643,9 +1643,9 @@ def test_dealloc(self): long_overflow = 2**128 self.assertRaises(TypeError, re.finditer, "a", {}) with self.assertRaises(OverflowError): - _sre.compile("abc", 0, [long_overflow], 0, {}, ()) + _sre.compile("abc", 0, [long_overflow], 0, {}, (), 0) with self.assertRaises(TypeError): - _sre.compile({}, 0, [], 0, [], []) + _sre.compile({}, 0, [], 0, [], [], 0) def test_search_dot_unicode(self): self.assertTrue(re.search("123.*-", '123abc-')) From dfec05de5ca6e68303f2c6b6cbca5c34fc772219 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:34:18 +0800 Subject: [PATCH 05/11] 5. change _validate_outer() parameter -_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) +_validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) --- Modules/_sre.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Modules/_sre.c b/Modules/_sre.c index 513c10d8efa554..a04b168d55bc23 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1646,7 +1646,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end) } static int -_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) +_validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) { /* Some variables are manipulated by the macros above */ SRE_CODE op; @@ -1667,8 +1667,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) sre_match() code is robust even if they don't, and the worst you can get is nonsensical match results. */ GET_ARG; - if (arg > 2 * (size_t)groups + 1) { - VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups)); + if (arg > 2 * (size_t)self->groups + 1) { + VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)self->groups)); FAIL; } break; @@ -1797,7 +1797,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) if (skip == 0) break; /* Stop 2 before the end; we check the JUMP below */ - if (!_validate_inner(code, code+skip-3, groups)) + if (!_validate_inner(code, code+skip-3, self)) FAIL; code += skip-3; /* Check that it ends with a JUMP, and that each JUMP @@ -1826,7 +1826,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (!_validate_inner(code, code+skip-4, groups)) + if (!_validate_inner(code, code+skip-4, self)) FAIL; code += skip-4; GET_OP; @@ -1846,7 +1846,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (!_validate_inner(code, code+skip-3, groups)) + if (!_validate_inner(code, code+skip-3, self)) FAIL; code += skip-3; GET_OP; @@ -1864,7 +1864,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_ATOMIC_GROUP: { GET_SKIP; - if (!_validate_inner(code, code+skip-2, groups)) + if (!_validate_inner(code, code+skip-2, self)) FAIL; code += skip-2; GET_OP; @@ -1878,7 +1878,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_GROUPREF_UNI_IGNORE: case SRE_OP_GROUPREF_LOC_IGNORE: GET_ARG; - if (arg >= (size_t)groups) + if (arg >= (size_t)self->groups) FAIL; break; @@ -1887,7 +1887,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) 'group' is either an integer group number or a group name, 'then' and 'else' are sub-regexes, and 'else' is optional. */ GET_ARG; - if (arg >= (size_t)groups) + if (arg >= (size_t)self->groups) FAIL; GET_SKIP_ADJ(1); code--; /* The skip is relative to the first arg! */ @@ -1920,17 +1920,17 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) code[skip-3] == SRE_OP_JUMP) { VTRACE(("both then and else parts present\n")); - if (!_validate_inner(code+1, code+skip-3, groups)) + if (!_validate_inner(code+1, code+skip-3, self)) FAIL; code += skip-2; /* Position after JUMP, at */ GET_SKIP; - if (!_validate_inner(code, code+skip-1, groups)) + if (!_validate_inner(code, code+skip-1, self)) FAIL; code += skip-1; } else { VTRACE(("only a then part present\n")); - if (!_validate_inner(code+1, code+skip-1, groups)) + if (!_validate_inner(code+1, code+skip-1, self)) FAIL; code += skip-1; } @@ -1944,7 +1944,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) if (arg & 0x80000000) FAIL; /* Width too large */ /* Stop 1 before the end; we check the SUCCESS below */ - if (!_validate_inner(code+1, code+skip-2, groups)) + if (!_validate_inner(code+1, code+skip-2, self)) FAIL; code += skip-2; GET_OP; @@ -1963,18 +1963,18 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) } static int -_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) +_validate_outer(SRE_CODE *code, SRE_CODE *end, PatternObject *self) { - if (groups < 0 || (size_t)groups > SRE_MAXGROUPS || + if (self->groups < 0 || (size_t)self->groups > SRE_MAXGROUPS || code >= end || end[-1] != SRE_OP_SUCCESS) FAIL; - return _validate_inner(code, end-1, groups); + return _validate_inner(code, end-1, self); } static int _validate(PatternObject *self) { - if (!_validate_outer(self->code, self->code+self->codesize, self->groups)) + if (!_validate_outer(self->code, self->code+self->codesize, self)) { PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); return 0; From db7e88b2af63d60b3bab3728e0a72a280331d02c Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:36:54 +0800 Subject: [PATCH 06/11] 6. validate in _validate_inner() / _validate_outer() --- Modules/_sre.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/Modules/_sre.c b/Modules/_sre.c index a04b168d55bc23..a89af5a2e0380c 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1473,8 +1473,8 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, self->pattern = pattern; self->flags = flags; - self->groups = groups; + self->repeat_count = repeat_count; if (PyDict_GET_SIZE(groupindex) > 0) { Py_INCREF(groupindex); @@ -1838,7 +1838,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) case SRE_OP_REPEAT: case SRE_OP_POSSESSIVE_REPEAT: { - SRE_CODE op1 = op, min, max; + SRE_CODE op1 = op, min, max, repeat_index, _fields; GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; @@ -1846,9 +1846,17 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (!_validate_inner(code, code+skip-3, self)) + if (op1 == SRE_OP_REPEAT) { + GET_ARG; repeat_index = arg; + if (repeat_index >= (size_t)self->repeat_count) + FAIL; + _fields = 4; + } else { + _fields = 3; + } + if (!_validate_inner(code, code+skip-_fields, self)) FAIL; - code += skip-3; + code += skip-_fields; GET_OP; if (op1 == SRE_OP_POSSESSIVE_REPEAT) { if (op != SRE_OP_SUCCESS) @@ -1966,6 +1974,7 @@ static int _validate_outer(SRE_CODE *code, SRE_CODE *end, PatternObject *self) { if (self->groups < 0 || (size_t)self->groups > SRE_MAXGROUPS || + self->repeat_count < 0 || code >= end || end[-1] != SRE_OP_SUCCESS) FAIL; return _validate_inner(code, end-1, self); From a43cb6e2d5125a87753467d52180585eb968f93b Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:39:40 +0800 Subject: [PATCH 07/11] 7. allocate repeats_array for SRE_STATE --- Modules/_sre.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Modules/_sre.c b/Modules/_sre.c index a89af5a2e0380c..9314fabb0d3c21 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -427,6 +427,12 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->lastmark = -1; state->lastindex = -1; + state->repeats_array = PyMem_New(SRE_REPEAT, pattern->repeat_count); + if (!state->repeats_array) { + PyErr_NoMemory(); + goto err; + } + state->buffer.buf = NULL; ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer); if (!ptr) @@ -476,6 +482,9 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, safely casted to `void*`, see bpo-39943 for details. */ PyMem_Free((void*) state->mark); state->mark = NULL; + PyMem_Free(state->repeats_array); + state->repeats_array = NULL; + if (state->buffer.buf) PyBuffer_Release(&state->buffer); return NULL; @@ -491,6 +500,8 @@ state_fini(SRE_STATE* state) /* See above PyMem_Del for why we explicitly cast here. */ PyMem_Free((void*) state->mark); state->mark = NULL; + PyMem_Free(state->repeats_array); + state->repeats_array = NULL; } /* calculate offset from start of string */ From 1560a0cc5977bd12fbb8bae4e7a2210fa3d4073d Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:41:25 +0800 Subject: [PATCH 08/11] 8. support code in sre_lib.h --- Modules/sre_lib.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 8e4e714eada389..1cc926d956c63f 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1032,16 +1032,14 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_REPEAT: /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ - /* <1=min> <2=max> item tail */ - TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); + /* <1=min> <2=max> + <3=repeat_index> item tail */ + TRACE(("|%p|%p|REPEAT %d %d %d\n", ctx->pattern, ctx->ptr, + ctx->pattern[1], ctx->pattern[2], ctx->pattern[3])); + + /* install repeat context */ + ctx->u.rep = &state->repeats_array[ctx->pattern[3]]; - /* install new repeat context */ - ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep)); - if (!ctx->u.rep) { - PyErr_NoMemory(); - RETURN_FAILURE; - } ctx->u.rep->count = -1; ctx->u.rep->pattern = ctx->pattern; ctx->u.rep->prev = state->repeat; @@ -1051,7 +1049,6 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); state->repeat = ctx->u.rep->prev; - PyObject_Free(ctx->u.rep); if (ret) { RETURN_ON_ERROR(ret); @@ -1061,7 +1058,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_MAX_UNTIL: /* maximizing repeat */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> + <3=repeat_index> item tail */ /* FIXME: we probably need to deal with zero-width matches in here... */ @@ -1081,7 +1079,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1103,7 +1101,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { MARK_POP_DISCARD(ctx->lastmark); @@ -1128,7 +1126,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_MIN_UNTIL: /* minimizing repeat */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> + <3=repeat_index> item tail */ ctx->u.rep = state->repeat; if (!ctx->u.rep) @@ -1145,7 +1144,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1188,7 +1187,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { RETURN_ON_ERROR(ret); From 96be0259a7c512c20f14e7fb8234fc7931fdb526 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 13:48:26 +0800 Subject: [PATCH 09/11] 9. add unit-tests --- Lib/test/test_re.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 565ea54ba0a9c8..1f25f52302fbff 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1646,6 +1646,9 @@ def test_dealloc(self): _sre.compile("abc", 0, [long_overflow], 0, {}, (), 0) with self.assertRaises(TypeError): _sre.compile({}, 0, [], 0, [], [], 0) + with self.assertRaises(RuntimeError): + # invalid repeat_count -1 + _sre.compile("abc", 0, [1], 0, {}, (), -1) def test_search_dot_unicode(self): self.assertTrue(re.search("123.*-", '123abc-')) @@ -2334,6 +2337,27 @@ def test_possesive_repeat(self): 14. SUCCESS ''') + def test_repeat_index(self): + self.assertEqual(get_debug_out(r'(?:ab)*(?:cd)*'), '''\ +MAX_REPEAT 0 MAXREPEAT + LITERAL 97 + LITERAL 98 +MAX_REPEAT 0 MAXREPEAT + LITERAL 99 + LITERAL 100 + + 0. INFO 4 0b0 0 MAXREPEAT (to 5) + 5: REPEAT 8 0 MAXREPEAT 0 (to 14) +10. LITERAL 0x61 ('a') +12. LITERAL 0x62 ('b') +14: MAX_UNTIL +15. REPEAT 8 0 MAXREPEAT 1 (to 24) +20. LITERAL 0x63 ('c') +22. LITERAL 0x64 ('d') +24: MAX_UNTIL +25. SUCCESS +''') + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): From 7ee3e669b08a84721e036b2f22256abc3f77521b Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 21:26:29 +0800 Subject: [PATCH 10/11] a. skip -= field_number --- Modules/_sre.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Modules/_sre.c b/Modules/_sre.c index 9314fabb0d3c21..506363d6fbf6d7 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1849,7 +1849,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) case SRE_OP_REPEAT: case SRE_OP_POSSESSIVE_REPEAT: { - SRE_CODE op1 = op, min, max, repeat_index, _fields; + SRE_CODE op1 = op, min, max, repeat_index; GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; @@ -1861,13 +1861,13 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, PatternObject *self) GET_ARG; repeat_index = arg; if (repeat_index >= (size_t)self->repeat_count) FAIL; - _fields = 4; + skip -= 4; } else { - _fields = 3; + skip -= 3; } - if (!_validate_inner(code, code+skip-_fields, self)) + if (!_validate_inner(code, code+skip, self)) FAIL; - code += skip-_fields; + code += skip; GET_OP; if (op1 == SRE_OP_POSSESSIVE_REPEAT) { if (op != SRE_OP_SUCCESS) From e95b19f63d36ede3525303e5363738d206083d47 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sun, 3 Apr 2022 21:28:35 +0800 Subject: [PATCH 11/11] b. improve unit-test --- Lib/test/test_re.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 1f25f52302fbff..553eb4cfe85b69 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2338,8 +2338,8 @@ def test_possesive_repeat(self): ''') def test_repeat_index(self): - self.assertEqual(get_debug_out(r'(?:ab)*(?:cd)*'), '''\ -MAX_REPEAT 0 MAXREPEAT + self.assertEqual(get_debug_out(r'(?:ab)*?(?:cd)*'), '''\ +MIN_REPEAT 0 MAXREPEAT LITERAL 97 LITERAL 98 MAX_REPEAT 0 MAXREPEAT @@ -2350,7 +2350,7 @@ def test_repeat_index(self): 5: REPEAT 8 0 MAXREPEAT 0 (to 14) 10. LITERAL 0x61 ('a') 12. LITERAL 0x62 ('b') -14: MAX_UNTIL +14: MIN_UNTIL 15. REPEAT 8 0 MAXREPEAT 1 (to 24) 20. LITERAL 0x63 ('c') 22. LITERAL 0x64 ('d')