From 8f76a1f38223cce4cf64064fc1b67670e1f4b806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1niel=20B=C3=A1tyai?= Date: Tue, 26 May 2020 15:28:54 +0200 Subject: [PATCH] Rework RegExp engine and add support for proper unicode matching (#3746) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change includes several bugfixes, general improvements, and support for additional features. - Added full support for web compatibility syntax defined in Annex B - Implemented parsing and matching patterns in unicode mode - Fixed capture results when iterating with nested capturing groups - Significantly reduced regexp bytecode size - Reduced stack usage during regexp execution - Improved matching performance JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu --- jerry-core/api/jerry-snapshot.c | 7 +- jerry-core/ecma/base/ecma-gc.c | 2 +- jerry-core/ecma/base/ecma-helpers-string.c | 6 +- .../builtin-objects/ecma-builtin-global.c | 23 +- .../ecma/builtin-objects/ecma-builtin-json.c | 11 +- .../ecma/builtin-objects/ecma-builtins.c | 8 +- .../ecma/operations/ecma-regexp-object.c | 1470 +++++++++------- .../ecma/operations/ecma-regexp-object.h | 86 +- jerry-core/jcontext/jcontext.h | 2 +- jerry-core/lit/lit-char-helpers.c | 116 +- jerry-core/lit/lit-char-helpers.h | 12 +- jerry-core/lit/lit-strings.c | 6 +- jerry-core/parser/js/js-lexer.c | 12 +- jerry-core/parser/js/js-parser.c | 8 + jerry-core/parser/regexp/re-bytecode.c | 696 +++++--- jerry-core/parser/regexp/re-bytecode.h | 122 +- .../parser/regexp/re-compiler-context.h | 60 + jerry-core/parser/regexp/re-compiler.c | 899 +--------- jerry-core/parser/regexp/re-compiler.h | 23 +- jerry-core/parser/regexp/re-parser.c | 1543 ++++++++++++----- jerry-core/parser/regexp/re-parser.h | 70 +- jerry-core/parser/regexp/re-token.h | 72 + tests/jerry/es2015/regexp-unicode.js | 361 ++++ tests/jerry/regexp-alternatives.js | 3 + tests/jerry/regexp-backreference.js | 3 + 
tests/jerry/regexp-backtrack.js | 115 ++ tests/jerry/regexp-capture-groups.js | 9 + .../regexp-simple-atom-and-iterations.js | 3 + tests/jerry/regression-test-issue-2190.js | 2 +- tests/jerry/string-prototype-trim.js | 2 + 30 files changed, 3373 insertions(+), 2379 deletions(-) create mode 100644 jerry-core/parser/regexp/re-compiler-context.h create mode 100644 jerry-core/parser/regexp/re-token.h create mode 100644 tests/jerry/es2015/regexp-unicode.js create mode 100644 tests/jerry/regexp-backtrack.js diff --git a/jerry-core/api/jerry-snapshot.c b/jerry-core/api/jerry-snapshot.c index c25cfc5ccd..e3f77f1222 100644 --- a/jerry-core/api/jerry-snapshot.c +++ b/jerry-core/api/jerry-snapshot.c @@ -559,7 +559,6 @@ snapshot_load_compiled_code (const uint8_t *base_addr_p, /**< base address of th #if ENABLED (JERRY_BUILTIN_REGEXP) if (!(bytecode_p->status_flags & CBC_CODE_FLAGS_FUNCTION)) { - const re_compiled_code_t *re_bytecode_p = NULL; const uint8_t *regex_start_p = ((const uint8_t *) bytecode_p) + sizeof (ecma_compiled_code_t); @@ -567,10 +566,8 @@ snapshot_load_compiled_code (const uint8_t *base_addr_p, /**< base address of th ecma_string_t *pattern_str_p = ecma_new_ecma_string_from_utf8 (regex_start_p, bytecode_p->refs); - re_compile_bytecode (&re_bytecode_p, - pattern_str_p, - bytecode_p->status_flags); - + const re_compiled_code_t *re_bytecode_p = re_compile_bytecode (pattern_str_p, + bytecode_p->status_flags); ecma_deref_ecma_string (pattern_str_p); return (ecma_compiled_code_t *) re_bytecode_p; diff --git a/jerry-core/ecma/base/ecma-gc.c b/jerry-core/ecma/base/ecma-gc.c index d828598de5..ee2cffede2 100644 --- a/jerry-core/ecma/base/ecma-gc.c +++ b/jerry-core/ecma/base/ecma-gc.c @@ -1467,7 +1467,7 @@ ecma_gc_run (void) #if ENABLED (JERRY_BUILTIN_REGEXP) /* Free RegExp bytecodes stored in cache */ - re_cache_gc_run (); + re_cache_gc (); #endif /* ENABLED (JERRY_BUILTIN_REGEXP) */ } /* ecma_gc_run */ diff --git a/jerry-core/ecma/base/ecma-helpers-string.c 
b/jerry-core/ecma/base/ecma-helpers-string.c index 3cd53da914..4c94038d20 100644 --- a/jerry-core/ecma/base/ecma-helpers-string.c +++ b/jerry-core/ecma/base/ecma-helpers-string.c @@ -2362,8 +2362,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr { read_size = lit_read_code_unit_from_utf8 (current_p, &ch); - if (!lit_char_is_white_space (ch) - && !lit_char_is_line_terminator (ch)) + if (!lit_char_is_white_space (ch)) { nonws_start_p = current_p; break; @@ -2378,8 +2377,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr { read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch); - if (!lit_char_is_white_space (ch) - && !lit_char_is_line_terminator (ch)) + if (!lit_char_is_white_space (ch)) { break; } diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-global.c b/jerry-core/ecma/builtin-objects/ecma-builtin-global.c index 0c00244a88..76f5de376a 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.c @@ -223,13 +223,13 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /* continue; } - ecma_char_t decoded_byte; - - if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte)) + uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2); + if (hex_value == UINT32_MAX) { return ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value.")); } + ecma_char_t decoded_byte = (ecma_char_t) hex_value; input_char_p += URI_ENCODED_BYTE_SIZE; if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) @@ -272,20 +272,18 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /* /* Input decode. 
*/ if (*input_char_p != '%') { - *output_char_p = *input_char_p; - output_char_p++; - input_char_p++; + *output_char_p++ = *input_char_p++; continue; } - ecma_char_t decoded_byte; - - if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte)) + uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2); + if (hex_value == UINT32_MAX) { ret_value = ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value.")); break; } + ecma_char_t decoded_byte = (ecma_char_t) hex_value; input_char_p += URI_ENCODED_BYTE_SIZE; if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) @@ -337,17 +335,16 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /* } else { - ecma_char_t chr; + hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2); - if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr) - || ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)) + if (hex_value == UINT32_MAX || (hex_value & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) { is_valid = false; break; } - octets[i] = (lit_utf8_byte_t) chr; input_char_p += URI_ENCODED_BYTE_SIZE; + octets[i] = (lit_utf8_byte_t) hex_value; } } diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-json.c b/jerry-core/ecma/builtin-objects/ecma-builtin-json.c index 3c32991529..ea6c2f613c 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-json.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-json.c @@ -174,18 +174,13 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument } case LIT_CHAR_LOWERCASE_U: { - if ((end_p - current_p <= ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH)) + uint32_t hex_value = lit_char_hex_lookup (current_p + 1, end_p, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH); + if (hex_value == UINT32_MAX) { goto invalid_string; } - ecma_char_t code_unit; - if (!(lit_read_code_unit_from_hex (current_p + 1, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH, &code_unit))) - { - goto invalid_string; - } - - 
ecma_stringbuilder_append_char (&result_builder, code_unit); + ecma_stringbuilder_append_char (&result_builder, (ecma_char_t) hex_value); current_p += ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH + 1; break; } diff --git a/jerry-core/ecma/builtin-objects/ecma-builtins.c b/jerry-core/ecma/builtin-objects/ecma-builtins.c index c173b5fe84..21b26cd4a2 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtins.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtins.c @@ -505,12 +505,10 @@ ecma_instantiate_builtin (ecma_builtin_id_t obj_builtin_id) /**< built-in id */ ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL; - const re_compiled_code_t *bc_p = NULL; - ecma_value_t ret_value = re_compile_bytecode (&bc_p, - ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP), - RE_FLAG_EMPTY); + re_compiled_code_t *bc_p = re_compile_bytecode (ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP), + RE_FLAG_EMPTY); - JERRY_ASSERT (ecma_is_value_empty (ret_value)); + JERRY_ASSERT (bc_p != NULL); ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bc_p); diff --git a/jerry-core/ecma/operations/ecma-regexp-object.c b/jerry-core/ecma/operations/ecma-regexp-object.c index b2b0f27591..5d6ad06872 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.c +++ b/jerry-core/ecma/operations/ecma-regexp-object.c @@ -46,11 +46,6 @@ */ #define RE_GLOBAL_CAPTURE 0 -/** - * Check if a RegExp opcode is a capture group or not - */ -#define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 1 : 0) - /** * Parse RegExp flags (global, ignoreCase, multiline) * @@ -200,36 +195,6 @@ ecma_regexp_update_props (ecma_object_t *re_object_p, /**< RegExp object */ } /* ecma_regexp_update_props */ #endif /* !ENABLED (JERRY_ES2015) */ -#if ENABLED (JERRY_ES2015) -/** - * Helper function to get current code point and advance the string pointer. 
- * - * @return lit_code_point_t current code point - */ -static lit_code_point_t -ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, /**< reference to string pointer */ - const lit_utf8_byte_t *end_p) /**< string end pointer */ -{ - JERRY_ASSERT (str_p != NULL); - const lit_utf8_byte_t *current_p = *str_p; - - lit_code_point_t ch = lit_cesu8_read_next (&current_p); - if (lit_is_code_point_utf16_high_surrogate ((ecma_char_t) ch) - && current_p < end_p) - { - const ecma_char_t next_ch = lit_cesu8_peek_next (current_p); - if (lit_is_code_point_utf16_low_surrogate (next_ch)) - { - lit_utf8_incr (&current_p); - ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch); - } - } - - *str_p = current_p; - return ch; -} /* ecma_regexp_unicode_advance */ -#endif /* ENABLED (JERRY_ES2015) */ - /** * RegExpAlloc method * @@ -379,17 +344,14 @@ ecma_op_create_regexp_from_pattern (ecma_object_t *regexp_obj_p, /**< RegExp obj JERRY_ASSERT (ecma_is_value_empty (parse_flags_value)); } - const re_compiled_code_t *bc_p = NULL; - ecma_value_t ret_value = re_compile_bytecode (&bc_p, pattern_str_p, flags); + re_compiled_code_t *bc_p = re_compile_bytecode (pattern_str_p, flags); - if (ECMA_IS_VALUE_ERROR (ret_value)) + if (JERRY_UNLIKELY (bc_p == NULL)) { ecma_deref_ecma_string (pattern_str_p); - return ret_value; + return ECMA_VALUE_ERROR; } - JERRY_ASSERT (ecma_is_value_empty (ret_value)); - ecma_op_regexp_initialize (regexp_obj_p, bc_p, pattern_str_p, flags); ecma_deref_ecma_string (pattern_str_p); @@ -437,19 +399,14 @@ ecma_op_create_regexp_with_flags (ecma_object_t *regexp_obj_p, /**< RegExp objec return ECMA_VALUE_ERROR; } - const re_compiled_code_t *bc_p = NULL; - - ecma_value_t ret_value = re_compile_bytecode (&bc_p, pattern_str_p, flags); - + re_compiled_code_t *bc_p = re_compile_bytecode (pattern_str_p, flags); ecma_deref_ecma_string (pattern_str_p); - if (ECMA_IS_VALUE_ERROR (ret_value)) + if (JERRY_UNLIKELY (bc_p == NULL)) { - return ret_value; + return 
ECMA_VALUE_ERROR; } - JERRY_ASSERT (ecma_is_value_empty (ret_value)); - ecma_op_regexp_initialize (regexp_obj_p, bc_p, pattern_str_p, flags); return ecma_make_object_value (regexp_obj_p); @@ -461,7 +418,8 @@ ecma_op_create_regexp_with_flags (ecma_object_t *regexp_obj_p, /**< RegExp objec * @return ecma_char_t canonicalized character */ lit_code_point_t -ecma_regexp_canonicalize_char (lit_code_point_t ch) /**< character */ +ecma_regexp_canonicalize_char (lit_code_point_t ch, /**< character */ + bool unicode) /**< unicode */ { if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)) { @@ -484,21 +442,19 @@ ecma_regexp_canonicalize_char (lit_code_point_t ch) /**< character */ ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH]; const ecma_length_t size = lit_char_to_upper_case ((ecma_char_t) ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH); - /* 3. */ if (size != 1) { return ch; } - /* 4. */ + const ecma_char_t cu = u[0]; - /* 5. */ - if (cu >= 128) + if (cu <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && !unicode) { /* 6. */ - return cu; + return ch; } - return ch; + return cu; } /* ecma_regexp_canonicalize_char */ /** @@ -508,31 +464,159 @@ ecma_regexp_canonicalize_char (lit_code_point_t ch) /**< character */ * * @return ecma_char_t canonicalized character */ -inline lit_code_point_t JERRY_ATTR_ALWAYS_INLINE +static inline lit_code_point_t JERRY_ATTR_ALWAYS_INLINE ecma_regexp_canonicalize (lit_code_point_t ch, /**< character */ - bool is_ignorecase) /**< IgnoreCase flag */ + uint16_t flags) /**< flags */ { - if (is_ignorecase) + if (flags & RE_FLAG_IGNORE_CASE) { - return ecma_regexp_canonicalize_char (ch); + return ecma_regexp_canonicalize_char (ch, flags & RE_FLAG_UNICODE); } return ch; } /* ecma_regexp_canonicalize */ /** - * Recursive function for RegExp matching. + * Check if a code point is matched by a class escape. 
+ * + * @return true, if code point matches escape + * false, otherwise + */ +static bool +ecma_regexp_check_class_escape (lit_code_point_t cp, /**< char */ + ecma_class_escape_t escape) /**< escape */ +{ + switch (escape) + { + case RE_ESCAPE_DIGIT: + { + return (cp >= LIT_CHAR_0 && cp <= LIT_CHAR_9); + } + case RE_ESCAPE_NOT_DIGIT: + { + return (cp < LIT_CHAR_0 || cp > LIT_CHAR_9); + } + case RE_ESCAPE_WORD_CHAR: + { + return lit_char_is_word_char (cp); + } + case RE_ESCAPE_NOT_WORD_CHAR: + { + return !lit_char_is_word_char (cp); + } + case RE_ESCAPE_WHITESPACE: + { + return lit_char_is_white_space ((ecma_char_t) cp); + } + case RE_ESCAPE_NOT_WHITESPACE: + { + return !lit_char_is_white_space ((ecma_char_t) cp); + } + default: + { + JERRY_UNREACHABLE (); + } + } +} /* ecma_regexp_check_class_escape */ + +/** + * Helper function to get current code point or code unit depending on execution mode, + * and advance the string pointer. + * + * @return lit_code_point_t current code point + */ +static lit_code_point_t +ecma_regexp_advance (ecma_regexp_ctx_t *re_ctx_p, /**< regexp context */ + const lit_utf8_byte_t **str_p) /**< reference to string pointer */ +{ + JERRY_ASSERT (str_p != NULL); + lit_code_point_t cp = lit_cesu8_read_next (str_p); + +#if ENABLED (JERRY_ES2015) + if (JERRY_UNLIKELY (re_ctx_p->flags & RE_FLAG_UNICODE) + && lit_is_code_point_utf16_high_surrogate ((ecma_char_t) cp) + && *str_p < re_ctx_p->input_end_p) + { + const ecma_char_t next_ch = lit_cesu8_peek_next (*str_p); + if (lit_is_code_point_utf16_low_surrogate (next_ch)) + { + cp = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) cp, next_ch); + *str_p += LIT_UTF8_MAX_BYTES_IN_CODE_UNIT; + } + } +#endif /* ENABLED (JERRY_ES2015) */ + + return ecma_regexp_canonicalize (cp, re_ctx_p->flags); +} /* ecma_regexp_advance */ + +#if ENABLED (JERRY_ES2015) +/** + * Helper function to get current full unicode code point and advance the string pointer. 
+ * + * @return lit_code_point_t current code point + */ +lit_code_point_t +ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, /**< reference to string pointer */ + const lit_utf8_byte_t *end_p) /**< string end pointer */ +{ + JERRY_ASSERT (str_p != NULL); + const lit_utf8_byte_t *current_p = *str_p; + + lit_code_point_t ch = lit_cesu8_read_next (&current_p); + if (lit_is_code_point_utf16_high_surrogate ((ecma_char_t) ch) + && current_p < end_p) + { + const ecma_char_t next_ch = lit_cesu8_peek_next (current_p); + if (lit_is_code_point_utf16_low_surrogate (next_ch)) + { + ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch); + current_p += LIT_UTF8_MAX_BYTES_IN_CODE_UNIT; + } + } + + *str_p = current_p; + return ch; +} /* ecma_regexp_unicode_advance */ +#endif /* ENABLED (JERRY_ES2015) */ + +/** + * Helper function to revert the string pointer to the previous code point. + * + * @return pointer to previous code point + */ +static JERRY_ATTR_NOINLINE const lit_utf8_byte_t * +ecma_regexp_step_back (ecma_regexp_ctx_t *re_ctx_p, /**< regexp context */ + const lit_utf8_byte_t *str_p) /**< reference to string pointer */ +{ + JERRY_ASSERT (str_p != NULL); +#if ENABLED (JERRY_ES2015) + lit_code_point_t ch = lit_cesu8_read_prev (&str_p); + if (JERRY_UNLIKELY (re_ctx_p->flags & RE_FLAG_UNICODE) + && lit_is_code_point_utf16_low_surrogate (ch) + && lit_is_code_point_utf16_high_surrogate (lit_cesu8_peek_prev (str_p))) + { + str_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT; + } +#else /* !ENABLED (JERRY_ES2015) */ + JERRY_UNUSED (re_ctx_p); + lit_utf8_decr (&str_p); +#endif /* !ENABLED (JERRY_ES2015) */ + return str_p; +} /* ecma_regexp_step_back */ + +/** + * Recursive function for executing RegExp bytecode. 
* * See also: * ECMA-262 v5, 15.10.2.1 * - * @return true - if matched - * false - otherwise + * @return pointer to the end of the currently matched substring + * NULL, if pattern did not match */ static const lit_utf8_byte_t * -ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ - const uint8_t *bc_p, /**< pointer to the current RegExp bytecode */ - const lit_utf8_byte_t *str_curr_p) /**< input string pointer */ +ecma_regexp_run (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ + const uint8_t *bc_p, /**< pointer to the current RegExp bytecode */ + const lit_utf8_byte_t *str_curr_p) /**< input string pointer */ { #if (JERRY_STACK_LIMIT != 0) if (JERRY_UNLIKELY (ecma_get_current_stack_usage () > CONFIG_MEM_STACK_LIMIT)) @@ -541,725 +625,950 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ } #endif /* JERRY_STACK_LIMIT != 0 */ + const lit_utf8_byte_t *str_start_p = str_curr_p; + const uint8_t *next_alternative_p = NULL; + while (true) { - re_opcode_t op = re_get_opcode (&bc_p); + const re_opcode_t op = re_get_opcode (&bc_p); switch (op) { - case RE_OP_MATCH: + case RE_OP_EOF: + { + re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].end_p = str_curr_p; + /* FALLTHRU */ + } + case RE_OP_ASSERT_END: + case RE_OP_ITERATOR_END: { - JERRY_TRACE_MSG ("Execute RE_OP_MATCH: match\n"); return str_curr_p; } - case RE_OP_CHAR: + case RE_OP_ALTERNATIVE_START: { - if (str_curr_p >= re_ctx_p->input_end_p) + const uint32_t offset = re_get_value (&bc_p); + next_alternative_p = bc_p + offset; + continue; + } + case RE_OP_ALTERNATIVE_NEXT: + { + while (true) { - return NULL; /* fail */ + const uint32_t offset = re_get_value (&bc_p); + bc_p += offset; + + if (*bc_p != RE_OP_ALTERNATIVE_NEXT) + { + break; + } + + bc_p++; } - const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; - lit_code_point_t ch1 = re_get_char (&bc_p); /* Already canonicalized. 
*/ - lit_code_point_t ch2 = lit_cesu8_read_next (&str_curr_p); + continue; + } + case RE_OP_NO_ALTERNATIVE: + { + return NULL; + } + case RE_OP_CAPTURING_GROUP_START: + { + const uint32_t group_idx = re_get_value (&bc_p); + ecma_regexp_capture_t *const group_p = re_ctx_p->captures_p + group_idx; + group_p->subcapture_count = re_get_value (&bc_p); + + const lit_utf8_byte_t *const saved_begin_p = group_p->begin_p; + const lit_utf8_byte_t *const saved_end_p = group_p->end_p; + const uint32_t saved_iterator = group_p->iterator; -#if ENABLED (JERRY_ES2015) - if (re_ctx_p->flags & RE_FLAG_UNICODE - && lit_is_code_point_utf16_high_surrogate (ch2) - && str_curr_p < re_ctx_p->input_end_p) + const uint32_t qmin = re_get_value (&bc_p); + group_p->end_p = NULL; + + /* If zero iterations are allowed, then execute the end opcode which will handle further iterations, + * otherwise run the 1st iteration immediately by executing group bytecode. */ + if (qmin == 0) { - const ecma_char_t next_ch = lit_cesu8_peek_next (str_curr_p); - if (lit_is_code_point_utf16_low_surrogate (next_ch)) - { - lit_utf8_incr (&str_curr_p); - ch2 = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch2, next_ch); - } + group_p->iterator = 0; + group_p->begin_p = NULL; + const uint32_t end_offset = re_get_value (&bc_p); + group_p->bc_p = bc_p; + + bc_p += end_offset; + } + else + { + group_p->iterator = 1; + group_p->begin_p = str_curr_p; + group_p->bc_p = bc_p; } -#endif /* ENABLED (JERRY_ES2015) */ - ch2 = ecma_regexp_canonicalize (ch2, is_ignorecase); - JERRY_TRACE_MSG ("Character matching %d to %d: ", ch1, ch2); + const lit_utf8_byte_t *matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + group_p->iterator = saved_iterator; - if (ch1 != ch2) + if (matched_p == NULL) { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + group_p->begin_p = saved_begin_p; + group_p->end_p = saved_end_p; + goto fail; } - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ + return matched_p; } - case 
RE_OP_PERIOD: + case RE_OP_NON_CAPTURING_GROUP_START: { - if (str_curr_p >= re_ctx_p->input_end_p) + const uint32_t group_idx = re_get_value (&bc_p); + ecma_regexp_non_capture_t *const group_p = re_ctx_p->non_captures_p + group_idx; + + group_p->subcapture_start = re_get_value (&bc_p); + group_p->subcapture_count = re_get_value (&bc_p); + + const uint32_t saved_iterator = group_p->iterator; + const uint32_t qmin = re_get_value (&bc_p); + + /* If zero iterations are allowed, then execute the end opcode which will handle further iterations, + * otherwise run the 1st iteration immediately by executing group bytecode. */ + if (qmin == 0) { - return NULL; /* fail */ + group_p->iterator = 0; + group_p->begin_p = NULL; + const uint32_t end_offset = re_get_value (&bc_p); + group_p->bc_p = bc_p; + + bc_p += end_offset; + } + else + { + group_p->iterator = 1; + group_p->begin_p = str_curr_p; + group_p->bc_p = bc_p; } - const ecma_char_t ch = lit_cesu8_read_next (&str_curr_p); - JERRY_TRACE_MSG ("Period matching '.' 
to %u: ", (unsigned int) ch); + const lit_utf8_byte_t *matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + group_p->iterator = saved_iterator; - if (lit_char_is_line_terminator (ch)) + if (matched_p == NULL) { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + goto fail; } -#if ENABLED (JERRY_ES2015) - if (re_ctx_p->flags & RE_FLAG_UNICODE - && lit_is_code_point_utf16_high_surrogate (ch) - && str_curr_p < re_ctx_p->input_end_p) + return matched_p; + } + case RE_OP_GREEDY_CAPTURING_GROUP_END: + { + const uint32_t group_idx = re_get_value (&bc_p); + ecma_regexp_capture_t *const group_p = re_ctx_p->captures_p + group_idx; + const uint32_t qmin = re_get_value (&bc_p); + + if (group_p->iterator < qmin) { - const ecma_char_t next_ch = lit_cesu8_peek_next (str_curr_p); - if (lit_is_code_point_utf16_low_surrogate (next_ch)) + /* No need to save begin_p since we don't have to backtrack beyond the minimum iteration count, but we have + * to clear nested capturing groups. */ + group_p->begin_p = str_curr_p; + for (uint32_t i = 1; i < group_p->subcapture_count; ++i) + { + group_p[i].begin_p = NULL; + } + + group_p->iterator++; + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p); + + if (matched_p != NULL) { - lit_utf8_incr (&str_curr_p); + return matched_p; } + + group_p->iterator--; + goto fail; } -#endif /* ENABLED (JERRY_ES2015) */ - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ - } - case RE_OP_ASSERT_START: - { - JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_START: "); + /* Empty matches are not allowed after reaching the minimum number of iterations. 
*/ + if (JERRY_UNLIKELY (group_p->begin_p >= str_curr_p) && (group_p->iterator > qmin)) + { + goto fail; + } - if (str_curr_p <= re_ctx_p->input_start_p) + const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET; + if (JERRY_UNLIKELY (group_p->iterator >= qmax)) { - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ + /* Reached maximum number of iterations, try to match tail bytecode. */ + group_p->end_p = str_curr_p; + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (matched_p != NULL) + { + return matched_p; + } + + goto fail; } - if (!(re_ctx_p->flags & RE_FLAG_MULTILINE)) { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + /* Save and clear all nested capturing groups, and try to iterate. */ + JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, group_p->subcapture_count); + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + saved_captures_p[i] = group_p[i].begin_p; + group_p[i].begin_p = NULL; + } + + group_p->iterator++; + group_p->begin_p = str_curr_p; + + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p); + + if (matched_p != NULL) + { + return matched_p; + } + + /* Failed to iterate again, backtrack to current match, and try to run tail bytecode. 
*/ + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + group_p[i].begin_p = saved_captures_p[i]; + } + + group_p->iterator--; + group_p->end_p = str_curr_p; } - if (lit_char_is_line_terminator (lit_cesu8_peek_prev (str_curr_p))) + const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (tail_match_p != NULL) { - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ + return tail_match_p; } - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + goto fail; } - case RE_OP_ASSERT_END: + case RE_OP_GREEDY_NON_CAPTURING_GROUP_END: { - JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_END: "); + const uint32_t group_idx = re_get_value (&bc_p); + ecma_regexp_non_capture_t *const group_p = re_ctx_p->non_captures_p + group_idx; + const uint32_t qmin = re_get_value (&bc_p); - if (str_curr_p >= re_ctx_p->input_end_p) + if (group_p->iterator < qmin) + { + /* No need to save begin_p but we have to clear nested capturing groups. */ + group_p->begin_p = str_curr_p; + + ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start; + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + capture_p[i].begin_p = NULL; + } + + group_p->iterator++; + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p); + + if (matched_p != NULL) + { + return matched_p; + } + + group_p->iterator--; + goto fail; + } + + /* Empty matches are not allowed after reaching the minimum number of iterations. */ + if (JERRY_UNLIKELY (group_p->begin_p >= str_curr_p) && (group_p->iterator > qmin)) { - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ + goto fail; } - if (!(re_ctx_p->flags & RE_FLAG_MULTILINE)) + const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET; + if (JERRY_UNLIKELY (group_p->iterator >= qmax)) { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + /* Reached maximum number of iterations, try to match tail bytecode. 
*/ + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (matched_p != NULL) + { + return matched_p; + } + + goto fail; } - if (lit_char_is_line_terminator (lit_cesu8_peek_next (str_curr_p))) { - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ + /* Save and clear all nested capturing groups, and try to iterate. */ + JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, group_p->subcapture_count); + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start + i; + saved_captures_p[i] = capture_p->begin_p; + capture_p->begin_p = NULL; + } + + group_p->iterator++; + const lit_utf8_byte_t *const saved_begin_p = group_p->begin_p; + group_p->begin_p = str_curr_p; + + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p); + + if (matched_p != NULL) + { + return matched_p; + } + + /* Failed to iterate again, backtrack to current match, and try to run tail bytecode. 
*/ + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start + i; + capture_p->begin_p = saved_captures_p[i]; + } + + group_p->iterator--; + group_p->begin_p = saved_begin_p; } - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (tail_match_p != NULL) + { + return tail_match_p; + } + + goto fail; } - case RE_OP_ASSERT_WORD_BOUNDARY: - case RE_OP_ASSERT_NOT_WORD_BOUNDARY: + case RE_OP_LAZY_CAPTURING_GROUP_END: { - const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p) - && lit_char_is_word_char (lit_cesu8_peek_prev (str_curr_p))); + const uint32_t group_idx = re_get_value (&bc_p); + ecma_regexp_capture_t *const group_p = re_ctx_p->captures_p + group_idx; + const uint32_t qmin = re_get_value (&bc_p); - const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p) - && lit_char_is_word_char (lit_cesu8_peek_next (str_curr_p))); - - if (op == RE_OP_ASSERT_WORD_BOUNDARY) + if (group_p->iterator < qmin) { - JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_WORD_BOUNDARY: "); - if (is_wordchar_left == is_wordchar_right) + /* No need to save begin_p but we have to clear nested capturing groups. */ + group_p->begin_p = str_curr_p; + for (uint32_t i = 1; i < group_p->subcapture_count; ++i) { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + group_p[i].begin_p = NULL; } + + group_p->iterator++; + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p); + + if (matched_p != NULL) + { + return matched_p; + } + + group_p->iterator--; + goto fail; } - else + + /* Empty matches are not allowed after reaching the minimum number of iterations. 
*/ + if (JERRY_UNLIKELY (group_p->begin_p >= str_curr_p) && (group_p->iterator > qmin)) + { + goto fail; + } + + const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET; + group_p->end_p = str_curr_p; + + /* Try to match tail bytecode. */ + const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (tail_match_p != NULL) { - JERRY_ASSERT (op == RE_OP_ASSERT_NOT_WORD_BOUNDARY); - JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_NOT_WORD_BOUNDARY: "); + return tail_match_p; + } + + if (JERRY_UNLIKELY (group_p->iterator >= qmax)) + { + /* Reached maximum number of iterations and tail bytecode did not match. */ + goto fail; + } - if (is_wordchar_left != is_wordchar_right) + { + /* Save and clear all nested capturing groups, and try to iterate. */ + JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, group_p->subcapture_count); + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + saved_captures_p[i] = group_p[i].begin_p; + group_p[i].begin_p = NULL; } + + group_p->iterator++; + group_p->begin_p = str_curr_p; + + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p); + + if (matched_p != NULL) + { + return matched_p; + } + + /* Backtrack to current match. 
*/ + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + group_p[i].begin_p = saved_captures_p[i]; + } + + group_p->iterator--; } - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ + goto fail; } - case RE_OP_LOOKAHEAD_POS: - case RE_OP_LOOKAHEAD_NEG: + case RE_OP_LAZY_NON_CAPTURING_GROUP_END: { - const lit_utf8_byte_t *matched_p = NULL; - const size_t captures_size = re_ctx_p->captures_count * sizeof (ecma_regexp_capture_t); - ecma_regexp_capture_t *saved_captures_p = (ecma_regexp_capture_t *) jmem_heap_alloc_block (captures_size); - memcpy (saved_captures_p, re_ctx_p->captures_p, captures_size); + const uint32_t group_idx = re_get_value (&bc_p); + ecma_regexp_non_capture_t *const group_p = re_ctx_p->non_captures_p + group_idx; + const uint32_t qmin = re_get_value (&bc_p); - do + if (group_p->iterator < qmin) { - const uint32_t offset = re_get_value (&bc_p); + /* Clear nested captures. */ + ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start; + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + capture_p[i].begin_p = NULL; + } - if (matched_p == NULL) + group_p->iterator++; + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p); + + if (matched_p != NULL) { - matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); + return matched_p; + } - if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) - { - jmem_heap_free_block (saved_captures_p, captures_size); - return matched_p; - } + group_p->iterator--; + goto fail; + } + + /* Empty matches are not allowed after reaching the minimum number of iterations. */ + if (JERRY_UNLIKELY (group_p->begin_p >= str_curr_p) && (group_p->iterator > qmin)) + { + goto fail; + } + + const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET; + + /* Try to match tail bytecode. 
*/ + const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (tail_match_p != NULL) + { + return tail_match_p; + } + + if (JERRY_UNLIKELY (group_p->iterator >= qmax)) + { + /* Reached maximum number of iterations and tail bytecode did not match. */ + goto fail; + } + + { + /* Save and clear all nested capturing groups, and try to iterate. */ + JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, group_p->subcapture_count); + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start + i; + saved_captures_p[i] = capture_p->begin_p; + capture_p->begin_p = NULL; } - bc_p += offset; + + group_p->iterator++; + const lit_utf8_byte_t *const saved_begin_p = group_p->begin_p; + group_p->begin_p = str_curr_p; + + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p); + + if (matched_p != NULL) + { + return matched_p; + } + + /* Backtrack to current match. 
*/ + for (uint32_t i = 0; i < group_p->subcapture_count; ++i) + { + ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start + i; + capture_p->begin_p = saved_captures_p[i]; + } + + group_p->iterator--; + group_p->begin_p = saved_begin_p; } - while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE); - JERRY_TRACE_MSG ("Execute RE_OP_LOOKAHEAD_POS/NEG: "); - if ((op == RE_OP_LOOKAHEAD_POS && matched_p != NULL) - || (op == RE_OP_LOOKAHEAD_NEG && matched_p == NULL)) + goto fail; + } + case RE_OP_GREEDY_ITERATOR: + { + const uint32_t qmin = re_get_value (&bc_p); + const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET; + const uint32_t end_offset = re_get_value (&bc_p); + + uint32_t iterator = 0; + while (iterator < qmin) { - JERRY_TRACE_MSG ("match\n"); - matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); + str_curr_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (str_curr_p == NULL) + { + goto fail; + } + + if (ECMA_RE_STACK_LIMIT_REACHED (str_curr_p)) + { + return str_curr_p; + } + + iterator++; } - else + + while (iterator < qmax) { - JERRY_TRACE_MSG ("fail\n"); - matched_p = NULL; /* fail */ + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (matched_p == NULL) + { + break; + } + + if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) + { + return matched_p; + } + + str_curr_p = matched_p; + iterator++; } - if (matched_p == NULL) + const uint8_t *const tail_bc_p = bc_p + end_offset; + while (true) { - /* restore saved */ - memcpy (re_ctx_p->captures_p, saved_captures_p, captures_size); + const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, tail_bc_p, str_curr_p); + + if (tail_match_p != NULL) + { + return tail_match_p; + } + + if (JERRY_UNLIKELY (iterator <= qmin)) + { + goto fail; + } + + iterator--; + JERRY_ASSERT (str_curr_p > re_ctx_p->input_start_p); + str_curr_p = ecma_regexp_step_back (re_ctx_p, str_curr_p); } - jmem_heap_free_block (saved_captures_p, 
captures_size); - return matched_p; + JERRY_UNREACHABLE (); } - case RE_OP_CHAR_CLASS: - case RE_OP_INV_CHAR_CLASS: + case RE_OP_LAZY_ITERATOR: { - JERRY_TRACE_MSG ("Execute RE_OP_CHAR_CLASS/RE_OP_INV_CHAR_CLASS, "); - if (str_curr_p >= re_ctx_p->input_end_p) - { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ - } + const uint32_t qmin = re_get_value (&bc_p); + const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET; + const uint32_t end_offset = re_get_value (&bc_p); - uint32_t range_count = re_get_value (&bc_p); - const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; - bool is_match = false; - -#if ENABLED (JERRY_ES2015) - if (re_ctx_p->flags & RE_FLAG_UNICODE) + uint32_t iterator = 0; + while (iterator < qmin) { - lit_code_point_t curr_ch = ecma_regexp_unicode_advance (&str_curr_p, - re_ctx_p->input_end_p); - curr_ch = ecma_regexp_canonicalize (curr_ch, is_ignorecase); + str_curr_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); - while (range_count-- > 0) + if (str_curr_p == NULL) { - const lit_code_point_t ch1 = re_get_value (&bc_p); - if (curr_ch < ch1) - { - bc_p += sizeof (uint32_t); - continue; - } + goto fail; + } - const lit_code_point_t ch2 = re_get_value (&bc_p); - is_match = (curr_ch <= ch2); - if (is_match) - { - /* Skip the remaining ranges in the bytecode. 
*/ - bc_p += range_count * 2 * sizeof (uint32_t); - break; - } + if (ECMA_RE_STACK_LIMIT_REACHED (str_curr_p)) + { + return str_curr_p; } + + iterator++; } - else + + const uint8_t *const tail_bc_p = bc_p + end_offset; + while (true) { -#endif /* ENABLED (JERRY_ES2015) */ - const ecma_char_t curr_ch = (ecma_char_t) ecma_regexp_canonicalize (lit_cesu8_read_next (&str_curr_p), - is_ignorecase); + const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, tail_bc_p, str_curr_p); - while (range_count-- > 0) + if (tail_match_p != NULL) { - const ecma_char_t ch1 = re_get_char (&bc_p); - if (curr_ch < ch1) - { - bc_p += sizeof (ecma_char_t); - continue; - } + return tail_match_p; + } - const ecma_char_t ch2 = re_get_char (&bc_p); - is_match = (curr_ch <= ch2); - if (is_match) - { - /* Skip the remaining ranges in the bytecode. */ - bc_p += range_count * 2 * sizeof (ecma_char_t); - break; - } + if (JERRY_UNLIKELY (iterator >= qmax)) + { + goto fail; } -#if ENABLED (JERRY_ES2015) - } -#endif /* ENABLED (JERRY_ES2015) */ - JERRY_ASSERT (op == RE_OP_CHAR_CLASS || op == RE_OP_INV_CHAR_CLASS); + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); - if ((op == RE_OP_CHAR_CLASS) != is_match) - { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + if (matched_p == NULL) + { + goto fail; + } + + if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) + { + return matched_p; + } + + iterator++; + str_curr_p = matched_p; } - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ + JERRY_UNREACHABLE (); } case RE_OP_BACKREFERENCE: { const uint32_t backref_idx = re_get_value (&bc_p); - JERRY_TRACE_MSG ("Execute RE_OP_BACKREFERENCE (idx: %u): ", (unsigned int) backref_idx); JERRY_ASSERT (backref_idx >= 1 && backref_idx < re_ctx_p->captures_count); - const ecma_regexp_capture_t capture = re_ctx_p->captures_p[backref_idx]; + const ecma_regexp_capture_t *capture_p = re_ctx_p->captures_p + backref_idx; - if (capture.begin_p == NULL || capture.end_p 
== NULL) + if (!ECMA_RE_IS_CAPTURE_DEFINED (capture_p) || capture_p->end_p <= capture_p->begin_p) { - JERRY_TRACE_MSG ("match\n"); - break; /* capture is 'undefined', always matches! */ + /* Undefined or zero length captures always match. */ + continue; } - const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture.end_p - capture.begin_p); - - if (str_curr_p + capture_size > re_ctx_p->input_end_p) - { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ - } + const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture_p->end_p - capture_p->begin_p); - if (memcmp (str_curr_p, capture.begin_p, capture_size)) + if (str_curr_p + capture_size > re_ctx_p->input_end_p + || memcmp (str_curr_p, capture_p->begin_p, capture_size)) { - JERRY_TRACE_MSG ("fail\n"); - return NULL; /* fail */ + goto fail; } str_curr_p += capture_size; - JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ + continue; } - case RE_OP_SAVE_AT_START: + case RE_OP_ASSERT_LINE_START: { - JERRY_TRACE_MSG ("Execute RE_OP_SAVE_AT_START\n"); - re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].begin_p = str_curr_p; - - do + if (str_curr_p <= re_ctx_p->input_start_p) { - const uint32_t offset = re_get_value (&bc_p); - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - - if (matched_p != NULL) - { - return matched_p; /* match */ - } + continue; + } - bc_p += offset; + if (!(re_ctx_p->flags & RE_FLAG_MULTILINE) || !lit_char_is_line_terminator (lit_cesu8_peek_prev (str_curr_p))) + { + goto fail; } - while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE); - bc_p -= sizeof (uint8_t); - return NULL; /* fail */ - } - case RE_OP_SAVE_AND_MATCH: - { - JERRY_TRACE_MSG ("End of pattern is reached: match\n"); - re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].end_p = str_curr_p; - return str_curr_p; /* match */ + continue; } - case RE_OP_ALTERNATIVE: + case RE_OP_ASSERT_LINE_END: { - /* - * Alternatives should be jumped over, when an alternative opcode appears. 
- */ - uint32_t offset = re_get_value (&bc_p); - JERRY_TRACE_MSG ("Execute RE_OP_ALTERNATIVE"); - bc_p += offset; + if (str_curr_p >= re_ctx_p->input_end_p) + { + continue; + } - while (*bc_p == RE_OP_ALTERNATIVE) + if (!(re_ctx_p->flags & RE_FLAG_MULTILINE) || !lit_char_is_line_terminator (lit_cesu8_peek_next (str_curr_p))) { - JERRY_TRACE_MSG (", jump: %u", (unsigned int) offset); - bc_p++; - offset = re_get_value (&bc_p); - bc_p += offset; + goto fail; } - JERRY_TRACE_MSG ("\n"); - break; /* tail merge */ + continue; } - case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START: - case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START: + case RE_OP_ASSERT_WORD_BOUNDARY: { - /* - * On non-greedy iterations we have to execute the bytecode - * after the group first, if zero iteration is allowed. - */ - const lit_utf8_byte_t *old_begin_p = NULL; - const uint8_t *const bc_start_p = bc_p; /* save the bytecode start position of the group start */ - const uint32_t start_idx = re_get_value (&bc_p); - const uint32_t offset = re_get_value (&bc_p); + const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p) + && lit_char_is_word_char (str_curr_p[-1])); - uint32_t *iterator_p; - if (RE_IS_CAPTURE_GROUP (op)) + const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p) + && lit_char_is_word_char (str_curr_p[0])); + if (is_wordchar_right == is_wordchar_left) { - JERRY_ASSERT (start_idx < re_ctx_p->captures_count); - re_ctx_p->captures_p[start_idx].begin_p = str_curr_p; - iterator_p = &(re_ctx_p->iterations_p[start_idx - 1]); + goto fail; } - else + + continue; + } + case RE_OP_ASSERT_NOT_WORD_BOUNDARY: + { + const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p) + && lit_char_is_word_char (str_curr_p[-1])); + + const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p) + && lit_char_is_word_char (str_curr_p[0])); + if (is_wordchar_right != is_wordchar_left) { - JERRY_ASSERT (start_idx < re_ctx_p->non_captures_count); - iterator_p = 
&(re_ctx_p->iterations_p[start_idx + re_ctx_p->captures_count - 1]); + goto fail; } - *iterator_p = 0; - - /* Jump all over to the end of the END opcode. */ - bc_p += offset; - /* Try to match after the close paren if zero is allowed */ - const lit_utf8_byte_t *matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); + continue; + } + case RE_OP_ASSERT_LOOKAHEAD_POS: + { + const uint8_t qmin = re_get_byte (&bc_p); + const uint32_t capture_start = re_get_value (&bc_p); + const uint32_t capture_count = re_get_value (&bc_p); + const uint32_t end_offset = re_get_value (&bc_p); - if (matched_p != NULL) + /* If qmin is zero, the assertion implicitly matches. */ + if (qmin == 0) { - return str_curr_p; /* match */ + bc_p += end_offset; + continue; } - if (RE_IS_CAPTURE_GROUP (op)) + /* Capture end pointers might get clobbered and need to be restored after a tail match fail. */ + JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, capture_count); + for (uint32_t i = 0; i < capture_count; ++i) { - re_ctx_p->captures_p[start_idx].begin_p = old_begin_p; + ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + capture_start + i; + saved_captures_p[i] = capture_p->end_p; } - bc_p = bc_start_p; - /* FALLTHRU */ - } - case RE_OP_CAPTURE_GROUP_START: - case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START: - case RE_OP_NON_CAPTURE_GROUP_START: - case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START: - { - const uint8_t *bc_end_p = NULL; - const uint32_t start_idx = re_get_value (&bc_p); + /* The first iteration will decide whether the assertion matches depending on whether + * the iteration matched or not. 
*/ + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); - if (op != RE_OP_CAPTURE_GROUP_START - && op != RE_OP_NON_CAPTURE_GROUP_START) + if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) { - const uint32_t offset = re_get_value (&bc_p); - bc_end_p = bc_p + offset; + return matched_p; } - const lit_utf8_byte_t **group_begin_p; - uint32_t *iterator_p; - if (RE_IS_CAPTURE_GROUP (op)) - { - JERRY_ASSERT (start_idx < re_ctx_p->captures_count); - group_begin_p = &(re_ctx_p->captures_p[start_idx].begin_p); - iterator_p = &(re_ctx_p->iterations_p[start_idx - 1]); - } - else + if (matched_p == NULL) { - JERRY_ASSERT (start_idx < re_ctx_p->non_captures_count); - group_begin_p = &(re_ctx_p->non_captures_p[start_idx].str_p); - iterator_p = &(re_ctx_p->iterations_p[start_idx + re_ctx_p->captures_count - 1]); + goto fail; } - const lit_utf8_byte_t *const old_begin_p = *group_begin_p; - const uint32_t old_iter_count = *iterator_p; - *group_begin_p = str_curr_p; - *iterator_p = 0; + const lit_utf8_byte_t *tail_match_p = ecma_regexp_run (re_ctx_p, bc_p + end_offset, str_curr_p); - do + if (tail_match_p == NULL) { - const uint32_t offset = re_get_value (&bc_p); - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - - if (matched_p != NULL) + for (uint32_t i = 0; i < capture_count; ++i) { - return matched_p; /* match */ + ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + capture_start + i; + capture_p->begin_p = NULL; + capture_p->end_p = saved_captures_p[i]; } - bc_p += offset; + goto fail; } - while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE); - bc_p -= sizeof (uint8_t); - *iterator_p = old_iter_count; + return tail_match_p; + } + case RE_OP_ASSERT_LOOKAHEAD_NEG: + { + const uint8_t qmin = re_get_byte (&bc_p); + uint32_t capture_idx = re_get_value (&bc_p); + const uint32_t capture_count = re_get_value (&bc_p); + const uint32_t end_offset = re_get_value (&bc_p); - /* Try to match after the close 
paren if zero is allowed. */ - if (op == RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START - || op == RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START) + /* If qmin is zero, the assertion implicitly matches. */ + if (qmin > 0) { - JERRY_ASSERT (bc_end_p); - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_end_p, str_curr_p); + /* The first iteration will decide whether the assertion matches depending on whether + * the iteration matched or not. */ + const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); + + if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) + { + return matched_p; + } if (matched_p != NULL) { - return matched_p; /* match */ + /* Nested capturing groups inside a negative lookahead can never capture, so we clear their results. */ + const uint32_t capture_end = capture_idx + capture_count; + while (capture_idx < capture_end) + { + re_ctx_p->captures_p[capture_idx++].begin_p = NULL; + } + + goto fail; } } - *group_begin_p = old_begin_p; - return NULL; /* fail */ + bc_p += end_offset; + continue; } - case RE_OP_CAPTURE_NON_GREEDY_GROUP_END: - case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END: + case RE_OP_CLASS_ESCAPE: { - /* - * On non-greedy iterations we have to execute the bytecode - * after the group first. Try to iterate only if it fails. 
- */ - const uint8_t *const bc_start_p = bc_p; /* save the bytecode start position of the group end */ - const uint32_t end_idx = re_get_value (&bc_p); - const uint32_t min = re_get_value (&bc_p); - const uint32_t max = re_get_value (&bc_p); - re_get_value (&bc_p); /* start offset */ - - const lit_utf8_byte_t **group_end_p; - uint32_t *iterator_p; - if (RE_IS_CAPTURE_GROUP (op)) - { - JERRY_ASSERT (end_idx < re_ctx_p->captures_count); - group_end_p = &(re_ctx_p->captures_p[end_idx].end_p); - iterator_p = &(re_ctx_p->iterations_p[end_idx - 1]); - } - else + if (str_curr_p >= re_ctx_p->input_end_p) { - JERRY_ASSERT (end_idx < re_ctx_p->non_captures_count); - group_end_p = &(re_ctx_p->non_captures_p[end_idx].str_p); - iterator_p = &(re_ctx_p->iterations_p[end_idx + re_ctx_p->captures_count - 1]); + goto fail; } - (*iterator_p)++; + const lit_code_point_t cp = ecma_regexp_advance (re_ctx_p, &str_curr_p); - if (*iterator_p >= min && *iterator_p <= max) + const ecma_class_escape_t escape = (ecma_class_escape_t) re_get_byte (&bc_p); + if (!ecma_regexp_check_class_escape (cp, escape)) { - const lit_utf8_byte_t *const old_end_p = *group_end_p; - *group_end_p = str_curr_p; - - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - - if (matched_p != NULL) - { - return matched_p; /* match */ - } - - *group_end_p = old_end_p; + goto fail; } - (*iterator_p)--; - bc_p = bc_start_p; - /* Non-greedy fails, try to iterate. 
*/ - /* FALLTHRU */ + continue; } - case RE_OP_CAPTURE_GREEDY_GROUP_END: - case RE_OP_NON_CAPTURE_GREEDY_GROUP_END: + case RE_OP_CHAR_CLASS: { - const uint32_t end_idx = re_get_value (&bc_p); - const uint32_t min = re_get_value (&bc_p); - const uint32_t max = re_get_value (&bc_p); - uint32_t offset = re_get_value (&bc_p); - - const lit_utf8_byte_t **group_begin_p; - const lit_utf8_byte_t **group_end_p; - uint32_t *iterator_p; - - if (RE_IS_CAPTURE_GROUP (op)) + if (str_curr_p >= re_ctx_p->input_end_p) { - JERRY_ASSERT (end_idx < re_ctx_p->captures_count); - group_begin_p = &(re_ctx_p->captures_p[end_idx].begin_p); - group_end_p = &(re_ctx_p->captures_p[end_idx].end_p); - iterator_p = &(re_ctx_p->iterations_p[end_idx - 1]); + goto fail; } - else + + uint8_t flags = re_get_byte (&bc_p); + uint32_t char_count = (flags & RE_CLASS_HAS_CHARS) ? re_get_value (&bc_p) : 0; + uint32_t range_count = (flags & RE_CLASS_HAS_RANGES) ? re_get_value (&bc_p) : 0; + + const lit_code_point_t cp = ecma_regexp_advance (re_ctx_p, &str_curr_p); + + uint8_t escape_count = flags & RE_CLASS_ESCAPE_COUNT_MASK; + while (escape_count > 0) { - JERRY_ASSERT (end_idx <= re_ctx_p->non_captures_count); - group_begin_p = &(re_ctx_p->non_captures_p[end_idx].str_p); - group_end_p = &(re_ctx_p->non_captures_p[end_idx].str_p); - iterator_p = &(re_ctx_p->iterations_p[end_idx + re_ctx_p->captures_count - 1]); + escape_count--; + const ecma_class_escape_t escape = re_get_byte (&bc_p); + if (ecma_regexp_check_class_escape (cp, escape)) + { + goto class_found; + } } - /* Check the empty iteration if the minimum number of iterations is reached. 
*/ - if (*iterator_p >= min && str_curr_p == *group_begin_p) + while (char_count > 0) { - return NULL; /* fail */ + char_count--; + const lit_code_point_t curr = re_get_char (&bc_p, re_ctx_p->flags & RE_FLAG_UNICODE); + if (cp == curr) + { + goto class_found; + } } - (*iterator_p)++; - - const uint8_t *const bc_start_p = bc_p; /* Save the bytecode end position of the END opcodes. */ - const lit_utf8_byte_t *const old_end_p = *group_end_p; - *group_end_p = str_curr_p; - - if (*iterator_p < max) + while (range_count > 0) { - bc_p -= offset; - offset = re_get_value (&bc_p); - - const lit_utf8_byte_t *const old_begin_p = *group_begin_p; - *group_begin_p = str_curr_p; + range_count--; + const lit_code_point_t begin = re_get_char (&bc_p, re_ctx_p->flags & RE_FLAG_UNICODE); - const lit_utf8_byte_t *matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - - if (matched_p != NULL) + if (cp < begin) { - return matched_p; /* match */ + bc_p += re_ctx_p->char_size; + continue; } - /* Try to match alternatives if any. */ - bc_p += offset; - while (*bc_p == RE_OP_ALTERNATIVE) + const lit_code_point_t end = re_get_char (&bc_p, re_ctx_p->flags & RE_FLAG_UNICODE); + if (cp <= end) { - bc_p++; /* RE_OP_ALTERNATIVE */ - offset = re_get_value (&bc_p); - - *group_begin_p = str_curr_p; - - matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - - if (matched_p != NULL) - { - return matched_p; /* match */ - } - - bc_p += offset; + goto class_found; } - - *group_begin_p = old_begin_p; } - if (*iterator_p >= min && *iterator_p <= max) + /* Not found */ + if (flags & RE_CLASS_INVERT) { - /* Try to match the rest of the bytecode. 
*/ - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_start_p, str_curr_p); + continue; + } - if (matched_p != NULL) - { - return matched_p; /* match */ - } + goto fail; + +class_found: + if (flags & RE_CLASS_INVERT) + { + goto fail; } - /* restore if fails */ - *group_end_p = old_end_p; - (*iterator_p)--; - return NULL; /* fail */ + const uint32_t chars_size = char_count * re_ctx_p->char_size; + const uint32_t ranges_size = range_count * re_ctx_p->char_size * 2; + bc_p = bc_p + escape_count + chars_size + ranges_size; + continue; } - case RE_OP_NON_GREEDY_ITERATOR: +#if ENABLED (JERRY_ES2015) + case RE_OP_UNICODE_PERIOD: { - const uint32_t min = re_get_value (&bc_p); - const uint32_t max = re_get_value (&bc_p); - - const uint32_t offset = re_get_value (&bc_p); - JERRY_TRACE_MSG ("Non-greedy iterator, min=%lu, max=%lu, offset=%ld\n", - (unsigned long) min, (unsigned long) max, (long) offset); - - uint32_t iter_count = 0; - while (iter_count <= max) + if (str_curr_p >= re_ctx_p->input_end_p) { - if (iter_count >= min) - { - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p + offset, str_curr_p); + goto fail; + } - if (matched_p != NULL) - { - return matched_p; /* match */ - } - } + const lit_code_point_t cp = ecma_regexp_unicode_advance (&str_curr_p, re_ctx_p->input_end_p); - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); + if (JERRY_UNLIKELY (cp <= LIT_UTF16_CODE_UNIT_MAX && lit_char_is_line_terminator ((ecma_char_t) cp))) + { + goto fail; + } - if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) - { - return matched_p; - } + continue; + } +#endif /* ENABLED (JERRY_ES2015) */ + case RE_OP_PERIOD: + { + if (str_curr_p >= re_ctx_p->input_end_p) + { + goto fail; + } - if (matched_p == NULL) - { - break; - } + const ecma_char_t ch = lit_cesu8_read_next (&str_curr_p); - str_curr_p = matched_p; - iter_count++; + if (lit_char_is_line_terminator (ch)) + { + goto fail; } - return NULL; /* 
fail */ + continue; } - default: + case RE_OP_CHAR: { - JERRY_ASSERT (op == RE_OP_GREEDY_ITERATOR); - - const uint32_t min = re_get_value (&bc_p); - const uint32_t max = re_get_value (&bc_p); + if (str_curr_p >= re_ctx_p->input_end_p) + { + goto fail; + } - const uint32_t offset = re_get_value (&bc_p); - JERRY_TRACE_MSG ("Greedy iterator, min=%lu, max=%lu, offset=%ld\n", - (unsigned long) min, (unsigned long) max, (long) offset); + const lit_code_point_t ch1 = re_get_char (&bc_p, re_ctx_p->flags & RE_FLAG_UNICODE); + const lit_code_point_t ch2 = ecma_regexp_advance (re_ctx_p, &str_curr_p); - uint32_t iter_count = 0; - while (iter_count < max) + if (ch1 != ch2) { - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - - if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) - { - return matched_p; - } + goto fail; + } - if (matched_p == NULL) - { - break; - } + continue; + } + default: + { + JERRY_ASSERT (op == RE_OP_BYTE); - str_curr_p = matched_p; - iter_count++; + if (str_curr_p >= re_ctx_p->input_end_p + || *bc_p++ != *str_curr_p++) + { + goto fail; } - if (iter_count >= min) - { - while (true) - { - const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p + offset, str_curr_p); + continue; + } + } - if (matched_p != NULL) - { - return matched_p; /* match */ - } + JERRY_UNREACHABLE (); +fail: + bc_p = next_alternative_p; - if (iter_count == min) - { - break; - } + if (bc_p == NULL || *bc_p++ != RE_OP_ALTERNATIVE_NEXT) + { + /* None of the alternatives matched. */ + return NULL; + } - lit_cesu8_read_prev (&str_curr_p); - iter_count--; - } - } + /* Get the end of the new alternative and continue execution. */ + str_curr_p = str_start_p; + const uint32_t offset = re_get_value (&bc_p); + next_alternative_p = bc_p + offset; + } +} /* ecma_regexp_run */ - return NULL; /* fail */ - } - } +/** + * Match a RegExp at a specific position in the input string. 
+ * + * @return pointer to the end of the matched sub-string + * NULL, if pattern did not match + */ +static const lit_utf8_byte_t * +ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ + const uint8_t *bc_p, /**< pointer to the current RegExp bytecode */ + const lit_utf8_byte_t *str_curr_p) /**< input string pointer */ +{ + re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].begin_p = str_curr_p; + + for (uint32_t i = 1; i < re_ctx_p->captures_count; ++i) + { + re_ctx_p->captures_p[i].begin_p = NULL; } + + return ecma_regexp_run (re_ctx_p, bc_p, str_curr_p); } /* ecma_regexp_match */ /* @@ -1273,6 +1582,7 @@ ecma_regexp_get_capture_value (const ecma_regexp_capture_t *const capture_p) /** { if (ECMA_RE_IS_CAPTURE_DEFINED (capture_p)) { + JERRY_ASSERT (capture_p->end_p >= capture_p->begin_p); const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture_p->end_p - capture_p->begin_p); ecma_string_t *const capture_str_p = ecma_new_ecma_string_from_utf8 (capture_p->begin_p, capture_size); return ecma_make_string_value (capture_str_p); @@ -1331,20 +1641,21 @@ ecma_regexp_initialize_context (ecma_regexp_ctx_t *ctx_p, /**< regexp context */ JERRY_ASSERT (input_start_p != NULL); JERRY_ASSERT (input_end_p >= input_start_p); + ctx_p->flags = bc_p->header.status_flags; + ctx_p->char_size = (ctx_p->flags & RE_FLAG_UNICODE) ? 
sizeof (lit_code_point_t) : sizeof (ecma_char_t); + ctx_p->input_start_p = input_start_p; ctx_p->input_end_p = input_end_p; ctx_p->captures_count = bc_p->captures_count; - ctx_p->captures_p = jmem_heap_alloc_block (ctx_p->captures_count * sizeof (ecma_regexp_capture_t)); - memset (ctx_p->captures_p, 0, ctx_p->captures_count * sizeof (ecma_regexp_capture_t)); - ctx_p->non_captures_count = bc_p->non_captures_count; - ctx_p->non_captures_p = jmem_heap_alloc_block (ctx_p->non_captures_count * sizeof (ecma_regexp_non_capture_t)); - memset (ctx_p->non_captures_p, 0, ctx_p->non_captures_count * sizeof (ecma_regexp_non_capture_t)); - const uint32_t iters_length = ctx_p->captures_count + ctx_p->non_captures_count - 1; - ctx_p->iterations_p = jmem_heap_alloc_block (iters_length * sizeof (uint32_t)); - memset (ctx_p->iterations_p, 0, iters_length * sizeof (uint32_t)); + ctx_p->captures_p = jmem_heap_alloc_block (ctx_p->captures_count * sizeof (ecma_regexp_capture_t)); + + if (ctx_p->non_captures_count > 0) + { + ctx_p->non_captures_p = jmem_heap_alloc_block (ctx_p->non_captures_count * sizeof (ecma_regexp_non_capture_t)); + } } /* ecma_regexp_initialize_context */ /** @@ -1355,15 +1666,11 @@ ecma_regexp_cleanup_context (ecma_regexp_ctx_t *ctx_p) /**< regexp context */ { JERRY_ASSERT (ctx_p != NULL); jmem_heap_free_block (ctx_p->captures_p, ctx_p->captures_count * sizeof (ecma_regexp_capture_t)); - if (ctx_p->non_captures_p != NULL) + + if (ctx_p->non_captures_count > 0) { jmem_heap_free_block (ctx_p->non_captures_p, ctx_p->non_captures_count * sizeof (ecma_regexp_non_capture_t)); } - if (ctx_p->iterations_p != NULL) - { - const uint32_t iters_length = ctx_p->captures_count + ctx_p->non_captures_count - 1; - jmem_heap_free_block (ctx_p->iterations_p, iters_length * sizeof (uint32_t)); - } } /* ecma_regexp_cleanup_context */ /** @@ -1391,8 +1698,6 @@ ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, /**< RegExp object */ re_compiled_code_t *bc_p = 
ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, ext_object_p->u.class_prop.u.value); - ecma_regexp_ctx_t re_ctx; - re_ctx.flags = bc_p->header.status_flags; lit_utf8_size_t input_size; lit_utf8_size_t input_length; uint8_t input_flags = ECMA_STRING_FLAG_IS_ASCII; @@ -1404,7 +1709,7 @@ ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, /**< RegExp object */ const lit_utf8_byte_t *input_curr_p = input_buffer_p; uint32_t index = 0; - if (re_ctx.flags & (RE_FLAG_GLOBAL | RE_FLAG_STICKY)) + if (bc_p->header.status_flags & (RE_FLAG_GLOBAL | RE_FLAG_STICKY)) { ecma_string_t *lastindex_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL); ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (regexp_object_p, lastindex_str_p); @@ -1464,6 +1769,7 @@ ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, /**< RegExp object */ } const lit_utf8_byte_t *input_end_p = input_buffer_p + input_size; + ecma_regexp_ctx_t re_ctx; ecma_regexp_initialize_context (&re_ctx, bc_p, input_buffer_p, @@ -1473,8 +1779,6 @@ ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, /**< RegExp object */ uint8_t *bc_start_p = (uint8_t *) (bc_p + 1); const lit_utf8_byte_t *matched_p = NULL; - JERRY_TRACE_MSG ("Exec with flags [%x]\n", re_ctx.flags); - JERRY_ASSERT (index <= input_length); while (true) { @@ -2077,7 +2381,6 @@ ecma_regexp_split_helper (ecma_value_t this_arg, /**< this value */ const lit_utf8_byte_t *const string_end_p = string_buffer_p + string_size; ecma_regexp_ctx_t re_ctx; - re_ctx.flags = bc_p->header.status_flags; ecma_regexp_initialize_context (&re_ctx, bc_p, string_buffer_p, @@ -2112,7 +2415,6 @@ ecma_regexp_split_helper (ecma_value_t this_arg, /**< this value */ while (current_str_p < string_end_p) { /* 13.a. 
*/ - memset (re_ctx.captures_p, 0, re_ctx.captures_count); const lit_utf8_byte_t *const matched_p = ecma_regexp_match (&re_ctx, bc_start_p, current_str_p); if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) @@ -2223,8 +2525,6 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**header.status_flags; uint8_t string_flags = ECMA_STRING_FLAG_IS_ASCII; lit_utf8_size_t string_length; @@ -2260,6 +2560,7 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**string_p, @@ -2271,7 +2572,6 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**begin_p != NULL && (c)->end_p >= (c)->begin_p) +#define RE_CLASS_ESCAPE_COUNT_MASK_SIZE (3u) -ecma_value_t -ecma_regexp_get_capture_value (const ecma_regexp_capture_t *const capture_p); +/** + * Character class flags escape count mask. + */ +#define RE_CLASS_ESCAPE_COUNT_MASK ((1 << RE_CLASS_ESCAPE_COUNT_MASK_SIZE) - 1u) /** - * Structure for storing non-capturing group results + * Character class flags that are present in the upper bits of the class flags byte, while the 3 least significant bits + * hold a value that contains the number of class escapes present in the character class. 
+ */ +typedef enum +{ + RE_CLASS_HAS_CHARS = (1 << 5), /**< contains individual characters */ + RE_CLASS_HAS_RANGES = (1 << 6), /**< contains character ranges */ + RE_CLASS_INVERT = (1 << 7), /**< inverted */ +} ecma_char_class_flags_t; + +/** + * Structure for matching capturing groups and storing their result */ typedef struct { - const lit_utf8_byte_t *str_p; /**< string pointer */ + const lit_utf8_byte_t *begin_p; /**< capture start pointer */ + const lit_utf8_byte_t *end_p; /**< capture end pointer */ + const uint8_t *bc_p; /**< group bytecode pointer */ + uint32_t iterator; /**< iteration counter */ + uint32_t subcapture_count; /**< number of nested capturing groups */ +} ecma_regexp_capture_t; + +/** + * Structure for matching non-capturing groups + */ +typedef struct +{ + const lit_utf8_byte_t *begin_p; /**< substring start pointer */ + const uint8_t *bc_p; /**< group bytecode pointer */ + uint32_t iterator; /**< iteration counter */ + uint32_t subcapture_start; /**< first nested capturing group index */ + uint32_t subcapture_count; /**< number of nested capturing groups */ } ecma_regexp_non_capture_t; +/** + * Check if an ecma_regexp_capture_t contains a defined capture + */ +#define ECMA_RE_IS_CAPTURE_DEFINED(c) ((c)->begin_p != NULL) + +ecma_value_t +ecma_regexp_get_capture_value (const ecma_regexp_capture_t *const capture_p); + #if (JERRY_STACK_LIMIT != 0) /** * Value used ase result when stack limit is reached @@ -82,27 +125,38 @@ typedef struct #define ECMA_RE_STACK_LIMIT_REACHED(p) (false) #endif /* JERRY_STACK_LIMIT != 0 */ +/** + * Offset applied to qmax when encoded into the bytecode. + * + * It's common for qmax to be Infinity, which is represented as UINT32_MAX. By applying the offset we are able to store + * it in a single byte as zero. 
+ */ +#define RE_QMAX_OFFSET 1 + /** * RegExp executor context */ typedef struct { - const lit_utf8_byte_t *input_end_p; /**< end of input string */ const lit_utf8_byte_t *input_start_p; /**< start of input string */ + const lit_utf8_byte_t *input_end_p; /**< end of input string */ uint32_t captures_count; /**< number of capture groups */ - ecma_regexp_capture_t *captures_p; /**< capturing groups */ uint32_t non_captures_count; /**< number of non-capture groups */ + ecma_regexp_capture_t *captures_p; /**< capturing groups */ ecma_regexp_non_capture_t *non_captures_p; /**< non-capturing groups */ - uint32_t *iterations_p; /**< number of iterations */ uint16_t flags; /**< RegExp flags */ + uint8_t char_size; /**< size of encoded characters */ } ecma_regexp_ctx_t; +#if ENABLED (JERRY_ES2015) +lit_code_point_t ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, const lit_utf8_byte_t *end_p); +#endif /* ENABLED (JERRY_ES2015) */ + ecma_object_t *ecma_op_regexp_alloc (ecma_object_t *new_target_obj_p); ecma_value_t ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, ecma_string_t *input_string_p); ecma_string_t *ecma_regexp_read_pattern_str_helper (ecma_value_t pattern_arg); -lit_code_point_t ecma_regexp_canonicalize (lit_code_point_t ch, bool is_ignorecase); -lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch); +lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch, bool unicode); ecma_value_t ecma_regexp_parse_flags (ecma_string_t *flags_str_p, uint16_t *flags_p); void ecma_regexp_create_and_initialize_props (ecma_object_t *re_object_p, ecma_string_t *source_p, diff --git a/jerry-core/jcontext/jcontext.h b/jerry-core/jcontext/jcontext.h index 698450f488..fd655651cc 100644 --- a/jerry-core/jcontext/jcontext.h +++ b/jerry-core/jcontext/jcontext.h @@ -127,7 +127,7 @@ struct jerry_context_t /* Update JERRY_CONTEXT_FIRST_MEMBER if the first non-external member changes */ jmem_cpointer_t ecma_builtin_objects[ECMA_BUILTIN_ID__COUNT]; 
/**< pointer to instances of built-in objects */ #if ENABLED (JERRY_BUILTIN_REGEXP) - const re_compiled_code_t *re_cache[RE_CACHE_SIZE]; /**< regex cache */ + re_compiled_code_t *re_cache[RE_CACHE_SIZE]; /**< regex cache */ #endif /* ENABLED (JERRY_BUILTIN_REGEXP) */ jmem_cpointer_t ecma_gc_objects_cp; /**< List of currently alive objects. */ jmem_heap_free_t *jmem_heap_list_skip_p; /**< This is used to speed up deallocation. */ diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c index 74c235cbc8..90606d323e 100644 --- a/jerry-core/lit/lit-char-helpers.c +++ b/jerry-core/lit/lit-char-helpers.c @@ -103,31 +103,32 @@ search_char_in_interval_array (ecma_char_t c, /**< code unit */ } /* search_char_in_interval_array */ /** - * Check if specified character is one of the Whitespace characters including those - * that fall into "Space, Separator" ("Zs") Unicode character category. + * Check if specified character is one of the Whitespace characters including those that fall into + * "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters. 
* * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2, * false - otherwise */ bool -lit_char_is_white_space (ecma_char_t c) /**< code unit */ +lit_char_is_white_space (lit_code_point_t c) /**< code point */ { if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) { - return (c == LIT_CHAR_TAB - || c == LIT_CHAR_VTAB - || c == LIT_CHAR_FF - || c == LIT_CHAR_SP); + return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR)); } else { - return (c == LIT_CHAR_NBSP - || c == LIT_CHAR_BOM - || (c >= lit_unicode_separator_char_interval_sps[0] - && c <= lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0]) - || search_char_in_char_array (c, - lit_unicode_separator_chars, - NUM_OF_ELEMENTS (lit_unicode_separator_chars))); + if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS) + { + return true; + } + + return (c <= LIT_UTF16_CODE_UNIT_MAX + && ((c >= lit_unicode_separator_char_interval_sps[0] + && c < lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0]) + || search_char_in_char_array ((ecma_char_t) c, + lit_unicode_separator_chars, + NUM_OF_ELEMENTS (lit_unicode_separator_chars)))); } } /* lit_char_is_white_space */ @@ -429,51 +430,72 @@ lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */ } /* lit_four_byte_utf8_char_to_cesu8 */ /** - * Parse the next number_of_characters hexadecimal character, - * and construct a code unit from them. The buffer must - * be zero terminated. 
+ * Lookup hex digits in a buffer * - * @return true if decoding was successful, false otherwise + * @return UINT32_MAX - if next 'lookup' number of characters do not form a valid hex number + * value of hex number, otherwise */ -bool -lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ - lit_utf8_size_t number_of_characters, /**< number of characters to be read */ - ecma_char_t *out_code_unit_p) /**< [out] decoded result */ +uint32_t +lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, /**< buffer */ + const lit_utf8_byte_t *const buf_end_p, /**< buffer end */ + uint32_t lookup) /**< size of lookup */ { - ecma_char_t code_unit = LIT_CHAR_NULL; + JERRY_ASSERT (lookup <= 4); - JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4); - - for (lit_utf8_size_t i = 0; i < number_of_characters; i++) + if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p)) { - code_unit = (ecma_char_t) (code_unit << 4u); + return UINT32_MAX; + } - if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN - && *buf_p <= LIT_CHAR_ASCII_DIGITS_END) - { - code_unit |= (ecma_char_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN); - } - else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - && *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END) - { - code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10)); - } - else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - && *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END) + uint32_t value = 0; + + while (lookup--) + { + lit_utf8_byte_t ch = *buf_p++; + if (!lit_char_is_hex_digit (ch)) { - code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10)); + return UINT32_MAX; } - else + + value <<= 4; + value += lit_char_hex_to_int (ch); + } + + JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX); + return value; +} /* lit_char_hex_lookup */ + +/** + * Parse a decimal number with the value clamped to UINT32_MAX. 
+ * + * @returns uint32_t number + */ +uint32_t +lit_parse_decimal (const lit_utf8_byte_t **buffer_p, /**< [in/out] character buffer */ + const lit_utf8_byte_t *buffer_end_p) /**< buffer end */ +{ + const lit_utf8_byte_t *current_p = *buffer_p; + JERRY_ASSERT (lit_char_is_decimal_digit (*current_p)); + + uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0); + + while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p)) + { + const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0); + uint32_t new_value = value * 10 + digit; + + if (JERRY_UNLIKELY (value > UINT32_MAX / 10) || JERRY_UNLIKELY (new_value < value)) { - return false; + value = UINT32_MAX; + continue; } - buf_p++; + value = new_value; } - *out_code_unit_p = code_unit; - return true; -} /* lit_read_code_unit_from_hex */ + *buffer_p = current_p; + return value; +} /* lit_parse_decimal */ /** * Check if specified character is a word character (part of IsWordChar abstract operation) @@ -484,7 +506,7 @@ lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with char * false - otherwise */ bool -lit_char_is_word_char (ecma_char_t c) /**< code unit */ +lit_char_is_word_char (lit_code_point_t c) /**< code point */ { return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END) || (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h index e6dbe6c585..3ad25b7fcd 100644 --- a/jerry-core/lit/lit-char-helpers.h +++ b/jerry-core/lit/lit-char-helpers.h @@ -18,8 +18,6 @@ #include "lit-globals.h" -#define LIT_CHAR_UNDEF ((ecma_char_t) 0xFFFF) /* undefined character */ - /* * Format control characters (ECMA-262 v5, Table 1) */ @@ -37,7 +35,7 @@ #define LIT_CHAR_NBSP ((ecma_char_t) 0x00A0) /* no-break space */ /* LIT_CHAR_BOM is defined above */ -bool lit_char_is_white_space (ecma_char_t c); +bool lit_char_is_white_space 
(lit_code_point_t c); /* * Line terminator characters (ECMA-262 v5, Table 3) @@ -219,10 +217,8 @@ uint32_t lit_char_hex_to_int (ecma_char_t c); size_t lit_code_point_to_cesu8_bytes (uint8_t *dst_p, lit_code_point_t code_point); size_t lit_code_point_get_cesu8_length (lit_code_point_t code_point); void lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, const uint8_t *source_p); - -/* read a hex encoded code point from a zero terminated buffer */ -bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t number_of_characters, - ecma_char_t *out_code_unit_p); +uint32_t lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, const lit_utf8_byte_t *const buf_end_p, uint32_t lookup); +uint32_t lit_parse_decimal (const lit_utf8_byte_t **buffer_p, const lit_utf8_byte_t *const buffer_end_p); /** * Null character @@ -232,7 +228,7 @@ bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t /* * Part of IsWordChar abstract operation (ECMA-262 v5, 15.10.2.6, step 3) */ -bool lit_char_is_word_char (ecma_char_t c); +bool lit_char_is_word_char (lit_code_point_t c); /* * Utility functions for uppercasing / lowercasing diff --git a/jerry-core/lit/lit-strings.c b/jerry-core/lit/lit-strings.c index 6f3b2ca096..c2fbb35f0f 100644 --- a/jerry-core/lit/lit-strings.c +++ b/jerry-core/lit/lit-strings.c @@ -513,7 +513,7 @@ lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with ch * * @return next code unit */ -ecma_char_t +ecma_char_t JERRY_ATTR_NOINLINE lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ { JERRY_ASSERT (buf_p != NULL); @@ -529,7 +529,7 @@ lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with cha * * @return previous code unit */ -ecma_char_t +ecma_char_t JERRY_ATTR_NOINLINE lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ { JERRY_ASSERT (buf_p != NULL); @@ -543,7 +543,7 @@ lit_cesu8_peek_prev (const 
lit_utf8_byte_t *buf_p) /**< [in,out] buffer with cha /** * Increase cesu-8 encoded string pointer by one code unit. */ -void +inline void JERRY_ATTR_ALWAYS_INLINE lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ { JERRY_ASSERT (*buf_p); diff --git a/jerry-core/parser/js/js-lexer.c b/jerry-core/parser/js/js-lexer.c index d327fb321c..2655d711ad 100644 --- a/jerry-core/parser/js/js-lexer.c +++ b/jerry-core/parser/js/js-lexer.c @@ -2847,9 +2847,6 @@ lexer_construct_regexp_object (parser_context_t *context_p, /**< context */ context_p->literal_count++; /* Compile the RegExp literal and store the RegExp bytecode pointer */ - const re_compiled_code_t *re_bytecode_p = NULL; - ecma_value_t completion_value; - ecma_string_t *pattern_str_p = NULL; if (lit_is_valid_cesu8_string (regex_start_p, length)) @@ -2862,19 +2859,14 @@ lexer_construct_regexp_object (parser_context_t *context_p, /**< context */ pattern_str_p = ecma_new_ecma_string_from_utf8_converted_to_cesu8 (regex_start_p, length); } - completion_value = re_compile_bytecode (&re_bytecode_p, - pattern_str_p, - current_flags); + re_compiled_code_t *re_bytecode_p = re_compile_bytecode (pattern_str_p, current_flags); ecma_deref_ecma_string (pattern_str_p); - if (ECMA_IS_VALUE_ERROR (completion_value)) + if (JERRY_UNLIKELY (re_bytecode_p == NULL)) { - jcontext_release_exception (); parser_raise_error (context_p, PARSER_ERR_INVALID_REGEXP); } - ecma_free_value (completion_value); - literal_p->type = LEXER_REGEXP_LITERAL; literal_p->u.bytecode_p = (ecma_compiled_code_t *) re_bytecode_p; diff --git a/jerry-core/parser/js/js-parser.c b/jerry-core/parser/js/js-parser.c index 3dd23f005c..519440b56c 100644 --- a/jerry-core/parser/js/js-parser.c +++ b/jerry-core/parser/js/js-parser.c @@ -2723,6 +2723,14 @@ parser_parse_script (const uint8_t *arg_list_p, /**< function argument list */ jcontext_raise_exception (ECMA_VALUE_NULL); return ECMA_VALUE_ERROR; } + + if (parser_error.error == 
PARSER_ERR_INVALID_REGEXP) + { + /* The RegExp compiler has already raised an exception. */ + JERRY_ASSERT (jcontext_has_pending_exception ()); + return ECMA_VALUE_ERROR; + } + #if ENABLED (JERRY_ERROR_MESSAGES) const lit_utf8_byte_t *err_bytes_p = (const lit_utf8_byte_t *) parser_error_to_string (parser_error.error); lit_utf8_size_t err_bytes_size = lit_zt_utf8_string_size (err_bytes_p); diff --git a/jerry-core/parser/regexp/re-bytecode.c b/jerry-core/parser/regexp/re-bytecode.c index 1722f0c2b7..151d1baa57 100644 --- a/jerry-core/parser/regexp/re-bytecode.c +++ b/jerry-core/parser/regexp/re-bytecode.c @@ -14,8 +14,9 @@ */ #include "ecma-globals.h" -#include "re-bytecode.h" #include "ecma-regexp-object.h" +#include "lit-strings.h" +#include "re-bytecode.h" #if ENABLED (JERRY_BUILTIN_REGEXP) @@ -29,135 +30,103 @@ * @{ */ -/** - * Size of block of RegExp bytecode. Used for allocation - * - * @return pointer to the RegExp compiled code header - */ -#define REGEXP_BYTECODE_BLOCK_SIZE 8UL - void -re_initialize_regexp_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ +re_initialize_regexp_bytecode (re_compiler_ctx_t *re_ctx_p) /**< RegExp bytecode context */ { - const size_t initial_size = JERRY_ALIGNUP (REGEXP_BYTECODE_BLOCK_SIZE + sizeof (re_compiled_code_t), JMEM_ALIGNMENT); - bc_ctx_p->block_start_p = jmem_heap_alloc_block (initial_size); - bc_ctx_p->block_end_p = bc_ctx_p->block_start_p + initial_size; - bc_ctx_p->current_p = bc_ctx_p->block_start_p + sizeof (re_compiled_code_t); + const size_t initial_size = sizeof (re_compiled_code_t); + re_ctx_p->bytecode_start_p = jmem_heap_alloc_block (initial_size); + re_ctx_p->bytecode_size = initial_size; } /* re_initialize_regexp_bytecode */ -/** - * Realloc the bytecode container - * - * @return current position in RegExp bytecode - */ -static uint8_t * -re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ +inline uint32_t JERRY_ATTR_ALWAYS_INLINE 
+re_bytecode_size (re_compiler_ctx_t *re_ctx_p) /**< RegExp bytecode context */ { - JERRY_ASSERT (bc_ctx_p->block_end_p >= bc_ctx_p->block_start_p); - const size_t old_size = (size_t) (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p); - - /* If one of the members of RegExp bytecode context is NULL, then all member should be NULL - * (it means first allocation), otherwise all of the members should be a non NULL pointer. */ - JERRY_ASSERT ((!bc_ctx_p->current_p && !bc_ctx_p->block_end_p && !bc_ctx_p->block_start_p) - || (bc_ctx_p->current_p && bc_ctx_p->block_end_p && bc_ctx_p->block_start_p)); - - const size_t new_size = old_size + REGEXP_BYTECODE_BLOCK_SIZE; - JERRY_ASSERT (bc_ctx_p->current_p >= bc_ctx_p->block_start_p); - const size_t current_ptr_offset = (size_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p); - - bc_ctx_p->block_start_p = jmem_heap_realloc_block (bc_ctx_p->block_start_p, - old_size, - new_size); - bc_ctx_p->block_end_p = bc_ctx_p->block_start_p + new_size; - bc_ctx_p->current_p = bc_ctx_p->block_start_p + current_ptr_offset; - - return bc_ctx_p->current_p; -} /* re_realloc_regexp_bytecode_block */ + return (uint32_t) re_ctx_p->bytecode_size; +} /* re_bytecode_size */ /** * Append a new bytecode to the and of the bytecode container */ static uint8_t * -re_bytecode_reserve (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ +re_bytecode_reserve (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ const size_t size) /**< size */ { - JERRY_ASSERT (size <= REGEXP_BYTECODE_BLOCK_SIZE); - - uint8_t *current_p = bc_ctx_p->current_p; - if (current_p + size > bc_ctx_p->block_end_p) - { - current_p = re_realloc_regexp_bytecode_block (bc_ctx_p); - } - - bc_ctx_p->current_p += size; - return current_p; + const size_t old_size = re_ctx_p->bytecode_size; + const size_t new_size = old_size + size; + re_ctx_p->bytecode_start_p = jmem_heap_realloc_block (re_ctx_p->bytecode_start_p, old_size, new_size); + re_ctx_p->bytecode_size = new_size; + 
return re_ctx_p->bytecode_start_p + old_size; } /* re_bytecode_reserve */ /** * Insert a new bytecode to the bytecode container */ -static void -re_bytecode_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ +static uint8_t * +re_bytecode_insert (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ const size_t offset, /**< distance from the start of the container */ const size_t size) /**< size */ { - JERRY_ASSERT (size <= REGEXP_BYTECODE_BLOCK_SIZE); - - uint8_t *current_p = bc_ctx_p->current_p; - if (current_p + size > bc_ctx_p->block_end_p) - { - re_realloc_regexp_bytecode_block (bc_ctx_p); - } + const size_t tail_size = re_ctx_p->bytecode_size - offset; + re_bytecode_reserve (re_ctx_p, size); - uint8_t *dest_p = bc_ctx_p->block_start_p + offset; - const size_t bytecode_length = re_get_bytecode_length (bc_ctx_p); - if (bytecode_length - offset > 0) - { - memmove (dest_p + size, dest_p, bytecode_length - offset); - } + uint8_t *dest_p = re_ctx_p->bytecode_start_p + offset; + memmove (dest_p + size, dest_p, tail_size); - bc_ctx_p->current_p += size; + return dest_p; } /* re_bytecode_insert */ /** - * Encode ecma_char_t into bytecode + * Append a byte */ -static void -re_encode_char (uint8_t *dest_p, /**< destination */ - const ecma_char_t c) /**< character */ +void +re_append_byte (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ + const uint8_t byte) /**< byte value */ { - *dest_p++ = (uint8_t) ((c >> 8) & 0xFF); - *dest_p = (uint8_t) (c & 0xFF); -} /* re_encode_char */ + uint8_t *dest_p = re_bytecode_reserve (re_ctx_p, sizeof (uint8_t)); + *dest_p = byte; +} /* re_append_byte */ /** - * Encode uint32_t into bytecode + * Insert a byte value */ -static void -re_encode_u32 (uint8_t *dest_p, /**< destination */ - const uint32_t u) /**< uint32 value */ +void +re_insert_byte (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ + const uint32_t offset, /**< distance from the start of the container */ + const uint8_t byte) 
/**< byte value */ { - *dest_p++ = (uint8_t) ((u >> 24) & 0xFF); - *dest_p++ = (uint8_t) ((u >> 16) & 0xFF); - *dest_p++ = (uint8_t) ((u >> 8) & 0xFF); - *dest_p = (uint8_t) (u & 0xFF); -} /* re_encode_u32 */ + uint8_t *dest_p = re_bytecode_insert (re_ctx_p, offset, sizeof (uint8_t)); + *dest_p = byte; +} /* re_insert_byte */ /** - * Get a character from the RegExp bytecode and increase the bytecode position - * - * @return ecma character + * Get a single byte and increase bytecode position. */ -inline ecma_char_t JERRY_ATTR_ALWAYS_INLINE -re_get_char (const uint8_t **bc_p) /**< pointer to bytecode start */ +inline uint8_t JERRY_ATTR_ALWAYS_INLINE +re_get_byte (const uint8_t **bc_p) /**< pointer to bytecode start */ { - const uint8_t *src_p = *bc_p; - ecma_char_t chr = (ecma_char_t) *src_p++; - chr = (ecma_char_t) (chr << 8); - chr = (ecma_char_t) (chr | *src_p); - (*bc_p) += sizeof (ecma_char_t); - return chr; -} /* re_get_char */ + return *((*bc_p)++); +} /* re_get_byte */ + +/** + * Append a RegExp opcode + */ +inline void JERRY_ATTR_ALWAYS_INLINE +re_append_opcode (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ + const re_opcode_t opcode) /**< input opcode */ +{ + re_append_byte (re_ctx_p, (uint8_t) opcode); +} /* re_append_opcode */ + +/** + * Insert a RegExp opcode + */ +inline void JERRY_ATTR_ALWAYS_INLINE +re_insert_opcode (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ + const uint32_t offset, /**< distance from the start of the container */ + const re_opcode_t opcode) /**< input opcode */ +{ + re_insert_byte (re_ctx_p, offset, (uint8_t) opcode); +} /* re_insert_opcode */ /** * Get a RegExp opcode and increase the bytecode position @@ -167,318 +136,497 @@ re_get_char (const uint8_t **bc_p) /**< pointer to bytecode start */ inline re_opcode_t JERRY_ATTR_ALWAYS_INLINE re_get_opcode (const uint8_t **bc_p) /**< pointer to bytecode start */ { - return (re_opcode_t) *((*bc_p)++); + return (re_opcode_t) re_get_byte (bc_p); } /* 
re_get_opcode */ /** - * Get a parameter of a RegExp opcode and increase the bytecode position + * Encode 2 byte unsigned integer into the bytecode + */ +static void +re_encode_u16 (uint8_t *dest_p, /**< destination */ + const uint16_t value) /**< value */ +{ + *dest_p++ = (uint8_t) ((value >> 8) & 0xFF); + *dest_p = (uint8_t) (value & 0xFF); +} /* re_encode_u16 */ + +/** + * Encode 4 byte unsigned integer into the bytecode + */ +static void +re_encode_u32 (uint8_t *dest_p, /**< destination */ + const uint32_t value) /**< value */ +{ + *dest_p++ = (uint8_t) ((value >> 24) & 0xFF); + *dest_p++ = (uint8_t) ((value >> 16) & 0xFF); + *dest_p++ = (uint8_t) ((value >> 8) & 0xFF); + *dest_p = (uint8_t) (value & 0xFF); +} /* re_encode_u32 */ + +/** + * Decode 2 byte unsigned integer from bytecode * - * @return opcode parameter + * @return uint16_t value */ -inline uint32_t JERRY_ATTR_ALWAYS_INLINE -re_get_value (const uint8_t **bc_p) /**< pointer to bytecode start */ +static uint16_t +re_decode_u16 (const uint8_t *src_p) /**< source */ { - const uint8_t *src_p = *bc_p; - uint32_t value = (uint32_t) (*src_p++); - value <<= 8; - value |= ((uint32_t) (*src_p++)); - value <<= 8; - value |= ((uint32_t) (*src_p++)); - value <<= 8; - value |= ((uint32_t) (*src_p++)); - - (*bc_p) += sizeof (uint32_t); + uint16_t value = (uint16_t) (((uint16_t) *src_p++) << 8); + value = (uint16_t) (value + *src_p++); return value; -} /* re_get_value */ +} /* re_decode_u16 */ /** - * Get length of bytecode + * Decode 4 byte unsigned integer from bytecode * - * @return bytecode length (unsigned integer) + * @return uint32_t value */ -inline uint32_t JERRY_ATTR_PURE JERRY_ATTR_ALWAYS_INLINE -re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ +static uint32_t JERRY_ATTR_NOINLINE +re_decode_u32 (const uint8_t *src_p) /**< source */ { - return ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p)); -} /* re_get_bytecode_length */ + uint32_t value = (uint32_t) 
(((uint32_t) *src_p++) << 24); + value += (uint32_t) (((uint32_t) *src_p++) << 16); + value += (uint32_t) (((uint32_t) *src_p++) << 8); + value += (uint32_t) (*src_p++); + return value; +} /* re_decode_u32 */ /** - * Append a RegExp opcode + * Get the encoded size of an uint32_t value. + * + * @return encoded value size */ -void -re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - const re_opcode_t opcode) /**< input opcode */ +inline static size_t JERRY_ATTR_ALWAYS_INLINE +re_get_encoded_value_size (uint32_t value) /**< value */ { - uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (uint8_t)); - *dest_p = (uint8_t) opcode; -} /* re_append_opcode */ + if (JERRY_LIKELY (value <= RE_VALUE_1BYTE_MAX)) + { + return 1; + } -/** - * Append a parameter of a RegExp opcode + return 5; +} /* re_get_encoded_value_size */ + +/* + * Encode a value to the specified position in the bytecode. */ -void -re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - const uint32_t value) /**< input value */ +static void +re_encode_value (uint8_t *dest_p, /**< position in bytecode */ + const uint32_t value) /**< value */ { - uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (uint32_t)); + if (JERRY_LIKELY (value <= RE_VALUE_1BYTE_MAX)) + { + *dest_p = (uint8_t) value; + return; + } + + *dest_p++ = (uint8_t) (RE_VALUE_4BYTE_MARKER); re_encode_u32 (dest_p, value); -} /* re_append_u32 */ +} /* re_encode_value */ /** - * Append a character to the RegExp bytecode + * Append a value to the end of the bytecode. 
*/ void -re_append_char (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - const ecma_char_t input_char) /**< input char */ +re_append_value (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ + const uint32_t value) /**< value */ { - uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (ecma_char_t)); - re_encode_char (dest_p, input_char); -} /* re_append_char */ + const size_t size = re_get_encoded_value_size (value); + uint8_t *dest_p = re_bytecode_reserve (re_ctx_p, size); + re_encode_value (dest_p, value); +} /* re_append_value */ /** - * Append a jump offset parameter of a RegExp opcode + * Insert a value into the bytecode at a specific offset. */ void -re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint32_t value) /**< input value */ +re_insert_value (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ + const uint32_t offset, /**< bytecode offset */ + const uint32_t value) /**< value */ { - value += (uint32_t) (sizeof (uint32_t)); - re_append_u32 (bc_ctx_p, value); -} /* re_append_jump_offset */ + const size_t size = re_get_encoded_value_size (value); + uint8_t *dest_p = re_bytecode_insert (re_ctx_p, offset, size); + re_encode_value (dest_p, value); +} /* re_insert_value */ /** - * Insert a RegExp opcode + * Read an encoded value from the bytecode. 
+ * + * @return decoded value + */ +uint32_t JERRY_ATTR_ALWAYS_INLINE +re_get_value (const uint8_t **bc_p) /**< reference to bytecode pointer */ +{ + uint32_t value = *(*bc_p)++; + if (JERRY_LIKELY (value <= RE_VALUE_1BYTE_MAX)) + { + return value; + } + + value = re_decode_u32 (*bc_p); + *bc_p += sizeof (uint32_t); + return value; +} /* re_get_value */ + +/** + * Append a character to the RegExp bytecode */ void -re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - const uint32_t offset, /**< distance from the start of the container */ - const re_opcode_t opcode) /**< input opcode */ +re_append_char (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ + const lit_code_point_t cp) /**< code point */ { - re_bytecode_insert (bc_ctx_p, offset, sizeof (uint8_t)); - *(bc_ctx_p->block_start_p + offset) = (uint8_t) opcode; -} /* re_insert_opcode */ +#if ENABLED (JERRY_ES2015) + const size_t size = (re_ctx_p->flags & RE_FLAG_UNICODE) ? sizeof (lit_code_point_t) : sizeof (ecma_char_t); +#else /* !ENABLED (JERRY_ES2015) */ + JERRY_UNUSED (re_ctx_p); + const size_t size = sizeof (ecma_char_t); +#endif /* !ENABLED (JERRY_ES2015) */ + + uint8_t *dest_p = re_bytecode_reserve (re_ctx_p, size); + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + re_encode_u32 (dest_p, cp); + return; + } +#endif /* ENABLED (JERRY_ES2015) */ + + JERRY_ASSERT (cp <= LIT_UTF16_CODE_UNIT_MAX); + re_encode_u16 (dest_p, (ecma_char_t) cp); +} /* re_append_char */ /** - * Insert a parameter of a RegExp opcode + * Insert a character into the RegExp bytecode at a specific offset */ void -re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint32_t offset, /**< distance from the start of the container */ - uint32_t value) /**< input value */ +re_insert_char (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */ + const uint32_t offset, /**< bytecode offset */ + const lit_code_point_t cp) /**< code point */ +{ +#if ENABLED (JERRY_ES2015) + 
const size_t size = (re_ctx_p->flags & RE_FLAG_UNICODE) ? sizeof (lit_code_point_t) : sizeof (ecma_char_t); +#else /* !ENABLED (JERRY_ES2015) */ + JERRY_UNUSED (re_ctx_p); + const size_t size = sizeof (ecma_char_t); +#endif /* !ENABLED (JERRY_ES2015) */ + + uint8_t *dest_p = re_bytecode_insert (re_ctx_p, offset, size); + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + re_encode_u32 (dest_p, cp); + return; + } +#endif /* ENABLED (JERRY_ES2015) */ + + JERRY_ASSERT (cp <= LIT_UTF16_CODE_UNIT_MAX); + re_encode_u16 (dest_p, (ecma_char_t) cp); +} /* re_insert_char */ + +/** + * Decode a character from the bytecode. + * + * @return decoded character + */ +inline lit_code_point_t JERRY_ATTR_ALWAYS_INLINE +re_get_char (const uint8_t **bc_p, /**< reference to bytecode pointer */ + bool unicode) /**< full unicode mode */ { - re_bytecode_insert (bc_ctx_p, offset, sizeof (uint32_t)); - re_encode_u32 (bc_ctx_p->block_start_p + offset, value); -} /* re_insert_u32 */ + lit_code_point_t cp; + +#if !ENABLED (JERRY_ES2015) + JERRY_UNUSED (unicode); +#else /* ENABLED (JERRY_ES2015) */ + if (unicode) + { + cp = re_decode_u32 (*bc_p); + *bc_p += sizeof (lit_code_point_t); + } + else +#endif /* ENABLED (JERRY_ES2015) */ + { + cp = re_decode_u16 (*bc_p); + *bc_p += sizeof (ecma_char_t); + } + + return cp; +} /* re_get_char */ #if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) +static uint32_t +re_get_bytecode_offset (const uint8_t *start_p, /**< bytecode start pointer */ + const uint8_t *current_p) /**< current bytecode pointer */ +{ + return (uint32_t) ((uintptr_t) current_p - (uintptr_t) start_p); +} /* re_get_bytecode_offset */ + /** * RegExp bytecode dumper */ void -re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ +re_dump_bytecode (re_compiler_ctx_t *re_ctx_p) /**< RegExp bytecode context */ { - re_compiled_code_t *compiled_code_p = (re_compiled_code_t *) bc_ctx_p->block_start_p; - JERRY_DEBUG_MSG ("%d ", 
compiled_code_p->header.status_flags); - JERRY_DEBUG_MSG ("%d ", compiled_code_p->captures_count); - JERRY_DEBUG_MSG ("%d | ", compiled_code_p->non_captures_count); + static const char escape_chars[] = {'d', 'D', 'w', 'W', 's', 'S'}; - const uint8_t *bytecode_p = (const uint8_t *) (compiled_code_p + 1); + re_compiled_code_t *compiled_code_p = (re_compiled_code_t *) re_ctx_p->bytecode_start_p; + JERRY_DEBUG_MSG ("Flags: 0x%x ", compiled_code_p->header.status_flags); + JERRY_DEBUG_MSG ("Capturing groups: %d ", compiled_code_p->captures_count); + JERRY_DEBUG_MSG ("Non-capturing groups: %d\n", compiled_code_p->non_captures_count); - re_opcode_t op; - while ((op = re_get_opcode (&bytecode_p))) + const uint8_t *bytecode_start_p = (const uint8_t *) (compiled_code_p + 1); + const uint8_t *bytecode_p = bytecode_start_p; + + while (true) { + JERRY_DEBUG_MSG ("[%3u] ", (uint32_t) ((uintptr_t) bytecode_p - (uintptr_t) bytecode_start_p)); + re_opcode_t op = *bytecode_p++; switch (op) { - case RE_OP_MATCH: + case RE_OP_ALTERNATIVE_START: { - JERRY_DEBUG_MSG ("MATCH, "); + JERRY_DEBUG_MSG ("ALTERNATIVE_START "); + const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p); + JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset); break; } - case RE_OP_CHAR: + case RE_OP_ALTERNATIVE_NEXT: { - JERRY_DEBUG_MSG ("CHAR "); - JERRY_DEBUG_MSG ("%c, ", (char) re_get_char (&bytecode_p)); + JERRY_DEBUG_MSG ("ALTERNATIVE_NEXT "); + const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p); + JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset); break; } - case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START: - { - JERRY_DEBUG_MSG ("N"); - /* FALLTHRU */ - } - case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START: + case RE_OP_NO_ALTERNATIVE: { - JERRY_DEBUG_MSG ("GZ_START "); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", 
re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("NO_ALTERNATIVES\n"); break; } - case RE_OP_CAPTURE_GROUP_START: + case RE_OP_CAPTURING_GROUP_START: { - JERRY_DEBUG_MSG ("START "); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("CAPTURING_GROUP_START "); + JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("capture count: %u, ", re_get_value (&bytecode_p)); + + const uint32_t qmin = re_get_value (&bytecode_p); + JERRY_DEBUG_MSG ("qmin: %u", qmin); + if (qmin == 0) + { + const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p); + JERRY_DEBUG_MSG (", tail offset: [%3u]\n", offset); + } + else + { + JERRY_DEBUG_MSG ("\n"); + } + break; } - case RE_OP_CAPTURE_NON_GREEDY_GROUP_END: - { - JERRY_DEBUG_MSG ("N"); - /* FALLTHRU */ - } - case RE_OP_CAPTURE_GREEDY_GROUP_END: + case RE_OP_NON_CAPTURING_GROUP_START: { - JERRY_DEBUG_MSG ("G_END "); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("NON_CAPTURING_GROUP_START "); + JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("capture start: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("capture count: %u, ", re_get_value (&bytecode_p)); + + const uint32_t qmin = re_get_value (&bytecode_p); + JERRY_DEBUG_MSG ("qmin: %u", qmin); + if (qmin == 0) + { + const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p); + JERRY_DEBUG_MSG (", tail offset: [%3u]\n", offset); + } + else + { + JERRY_DEBUG_MSG ("\n"); + } + break; } - case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START: + case RE_OP_GREEDY_CAPTURING_GROUP_END: { - JERRY_DEBUG_MSG ("N"); - /* FALLTHRU */ + JERRY_DEBUG_MSG ("GREEDY_CAPTURING_GROUP_END 
"); + JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmax: %u\n", re_get_value (&bytecode_p) - RE_QMAX_OFFSET); + break; } - case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START: + case RE_OP_LAZY_CAPTURING_GROUP_END: { - JERRY_DEBUG_MSG ("GZ_NC_START "); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("LAZY_CAPTURING_GROUP_END "); + JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmax: %u\n", re_get_value (&bytecode_p) - RE_QMAX_OFFSET); break; } - case RE_OP_NON_CAPTURE_GROUP_START: + case RE_OP_GREEDY_NON_CAPTURING_GROUP_END: { - JERRY_DEBUG_MSG ("NC_START "); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("GREEDY_NON_CAPTURING_GROUP_END "); + JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmax: %u\n", re_get_value (&bytecode_p) - RE_QMAX_OFFSET); break; } - case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END: + case RE_OP_LAZY_NON_CAPTURING_GROUP_END: { - JERRY_DEBUG_MSG ("N"); - /* FALLTHRU */ + JERRY_DEBUG_MSG ("LAZY_NON_CAPTURING_GROUP_END "); + JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmax: %u\n", re_get_value (&bytecode_p) - RE_QMAX_OFFSET); + break; } - case RE_OP_NON_CAPTURE_GREEDY_GROUP_END: + case RE_OP_GREEDY_ITERATOR: { - JERRY_DEBUG_MSG ("G_NC_END "); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + 
JERRY_DEBUG_MSG ("GREEDY_ITERATOR "); + JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmax: %u, ", re_get_value (&bytecode_p) - RE_QMAX_OFFSET); + const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p); + JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset); break; } - case RE_OP_SAVE_AT_START: + case RE_OP_LAZY_ITERATOR: { - JERRY_DEBUG_MSG ("RE_START "); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("LAZY_ITERATOR "); + JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("qmax: %u, ", re_get_value (&bytecode_p) - RE_QMAX_OFFSET); + const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p); + JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset); break; } - case RE_OP_SAVE_AND_MATCH: + case RE_OP_ITERATOR_END: { - JERRY_DEBUG_MSG ("RE_END, "); + JERRY_DEBUG_MSG ("ITERATOR_END\n"); break; } - case RE_OP_GREEDY_ITERATOR: + case RE_OP_BACKREFERENCE: { - JERRY_DEBUG_MSG ("GREEDY_ITERATOR "); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("BACKREFERENCE "); + JERRY_DEBUG_MSG ("idx: %d\n", re_get_value (&bytecode_p)); break; } - case RE_OP_NON_GREEDY_ITERATOR: + case RE_OP_ASSERT_LINE_START: { - JERRY_DEBUG_MSG ("NON_GREEDY_ITERATOR "); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("ASSERT_LINE_START\n"); break; } - case RE_OP_PERIOD: + case RE_OP_ASSERT_LINE_END: { - JERRY_DEBUG_MSG ("PERIOD "); + JERRY_DEBUG_MSG ("ASSERT_LINE_END\n"); break; } - case RE_OP_ALTERNATIVE: + case RE_OP_ASSERT_LOOKAHEAD_POS: { - JERRY_DEBUG_MSG ("ALTERNATIVE "); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG 
("ASSERT_LOOKAHEAD_POS "); + JERRY_DEBUG_MSG ("qmin: %u, ", *bytecode_p++); + JERRY_DEBUG_MSG ("capture start: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("capture count: %u, ", re_get_value (&bytecode_p)); + const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p); + JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset); break; } - case RE_OP_ASSERT_START: + case RE_OP_ASSERT_LOOKAHEAD_NEG: { - JERRY_DEBUG_MSG ("ASSERT_START "); + JERRY_DEBUG_MSG ("ASSERT_LOOKAHEAD_NEG "); + JERRY_DEBUG_MSG ("qmin: %u, ", *bytecode_p++); + JERRY_DEBUG_MSG ("capture start: %u, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("capture count: %u, ", re_get_value (&bytecode_p)); + const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p); + JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset); break; } case RE_OP_ASSERT_END: { - JERRY_DEBUG_MSG ("ASSERT_END "); + JERRY_DEBUG_MSG ("ASSERT_END\n"); break; } case RE_OP_ASSERT_WORD_BOUNDARY: { - JERRY_DEBUG_MSG ("ASSERT_WORD_BOUNDARY "); + JERRY_DEBUG_MSG ("ASSERT_WORD_BOUNDARY\n"); break; } case RE_OP_ASSERT_NOT_WORD_BOUNDARY: { - JERRY_DEBUG_MSG ("ASSERT_NOT_WORD_BOUNDARY "); + JERRY_DEBUG_MSG ("ASSERT_NOT_WORD_BOUNDARY\n"); break; } - case RE_OP_LOOKAHEAD_POS: + case RE_OP_CLASS_ESCAPE: { - JERRY_DEBUG_MSG ("LOOKAHEAD_POS "); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + ecma_class_escape_t escape = (ecma_class_escape_t) *bytecode_p++; + JERRY_DEBUG_MSG ("CLASS_ESCAPE \\%c\n", escape_chars[escape]); + break; + } + case RE_OP_CHAR_CLASS: + { + JERRY_DEBUG_MSG ("CHAR_CLASS "); + uint8_t flags = *bytecode_p++; + uint32_t char_count = (flags & RE_CLASS_HAS_CHARS) ? re_get_value (&bytecode_p) : 0; + uint32_t range_count = (flags & RE_CLASS_HAS_RANGES) ? 
re_get_value (&bytecode_p) : 0; + + if (flags & RE_CLASS_INVERT) + { + JERRY_DEBUG_MSG ("inverted "); + } + + JERRY_DEBUG_MSG ("escapes: "); + uint8_t escape_count = flags & RE_CLASS_ESCAPE_COUNT_MASK; + while (escape_count--) + { + JERRY_DEBUG_MSG ("\\%c, ", escape_chars[*bytecode_p++]); + } + + JERRY_DEBUG_MSG ("chars: "); + while (char_count--) + { + JERRY_DEBUG_MSG ("\\u%04x, ", re_get_char (&bytecode_p, re_ctx_p->flags & RE_FLAG_UNICODE)); + } + + JERRY_DEBUG_MSG ("ranges: "); + while (range_count--) + { + const lit_code_point_t begin = re_get_char (&bytecode_p, re_ctx_p->flags & RE_FLAG_UNICODE); + const lit_code_point_t end = re_get_char (&bytecode_p, re_ctx_p->flags & RE_FLAG_UNICODE); + JERRY_DEBUG_MSG ("\\u%04x-\\u%04x, ", begin, end); + } + + JERRY_DEBUG_MSG ("\n"); break; } - case RE_OP_LOOKAHEAD_NEG: +#if ENABLED (JERRY_ES2015) + case RE_OP_UNICODE_PERIOD: { - JERRY_DEBUG_MSG ("LOOKAHEAD_NEG "); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("UNICODE_PERIOD\n"); break; } - case RE_OP_BACKREFERENCE: +#endif /* ENABLED (JERRY_ES2015) */ + case RE_OP_PERIOD: { - JERRY_DEBUG_MSG ("BACKREFERENCE "); - JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("PERIOD\n"); break; } - case RE_OP_INV_CHAR_CLASS: + case RE_OP_CHAR: { - JERRY_DEBUG_MSG ("INV_"); - /* FALLTHRU */ + JERRY_DEBUG_MSG ("CHAR \\u%04x\n", re_get_char (&bytecode_p, re_ctx_p->flags & RE_FLAG_UNICODE)); + break; } - case RE_OP_CHAR_CLASS: + case RE_OP_BYTE: { - JERRY_DEBUG_MSG ("CHAR_CLASS "); - uint32_t num_of_class = re_get_value (&bytecode_p); - JERRY_DEBUG_MSG ("%d", num_of_class); - while (num_of_class) - { - if ((compiled_code_p->header.status_flags & RE_FLAG_UNICODE) != 0) - { - JERRY_DEBUG_MSG (" %u", re_get_value (&bytecode_p)); - JERRY_DEBUG_MSG ("-%u", re_get_value (&bytecode_p)); - } - else - { - JERRY_DEBUG_MSG (" %u", re_get_char (&bytecode_p)); - JERRY_DEBUG_MSG ("-%u", re_get_char (&bytecode_p)); - } - num_of_class--; - } - 
JERRY_DEBUG_MSG (", "); + const uint8_t ch = *bytecode_p++; + JERRY_DEBUG_MSG ("BYTE \\u%04x '%c'\n", ch, (char) ch); break; } + case RE_OP_EOF: + { + JERRY_DEBUG_MSG ("EOF\n"); + return; + } default: { - JERRY_DEBUG_MSG ("UNKNOWN(%d), ", (uint32_t) op); + JERRY_DEBUG_MSG ("UNKNOWN(%d)\n", (uint32_t) op); break; } } } - JERRY_DEBUG_MSG ("EOF\n"); } /* re_dump_bytecode */ #endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */ diff --git a/jerry-core/parser/regexp/re-bytecode.h b/jerry-core/parser/regexp/re-bytecode.h index 715170bb18..2a293a198c 100644 --- a/jerry-core/parser/regexp/re-bytecode.h +++ b/jerry-core/parser/regexp/re-bytecode.h @@ -19,6 +19,7 @@ #if ENABLED (JERRY_BUILTIN_REGEXP) #include "ecma-globals.h" +#include "re-compiler-context.h" /** \addtogroup parser Parser * @{ @@ -40,43 +41,57 @@ */ #define RE_FLAGS_MASK 0x3F +/** + * Maximum value that can be encoded in the RegExp bytecode as a single byte. + */ +#define RE_VALUE_1BYTE_MAX 0xFE + +/** + * Marker that signals that the actual value is encoded in the following 4 bytes in the bytecode. + */ +#define RE_VALUE_4BYTE_MARKER 0xFF + /** * RegExp opcodes */ typedef enum { - RE_OP_EOF, - /* Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it. - * Change it carefully. Capture opcodes should be at first. 
- */ - RE_OP_CAPTURE_GROUP_START, /**< group start */ - RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START, /**< greedy zero group start */ - RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-greedy zero group start */ - RE_OP_CAPTURE_GREEDY_GROUP_END, /**< greedy group end */ - RE_OP_CAPTURE_NON_GREEDY_GROUP_END, /**< non-greedy group end */ - RE_OP_NON_CAPTURE_GROUP_START, /**< non-capture group start */ - RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START, /**< non-capture greedy zero group start */ - RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-capture non-greedy zero group start */ - RE_OP_NON_CAPTURE_GREEDY_GROUP_END, /**< non-capture greedy group end */ - RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END, /**< non-capture non-greedy group end */ - - RE_OP_MATCH, /**< match */ - RE_OP_CHAR, /**< any character */ - RE_OP_SAVE_AT_START, /**< save at start */ - RE_OP_SAVE_AND_MATCH, /**< save and match */ - RE_OP_PERIOD, /**< "." */ - RE_OP_ALTERNATIVE, /**< "|" */ + RE_OP_EOF, /**< end of pattern */ + + RE_OP_ALTERNATIVE_START, /**< start of alternatives */ + RE_OP_ALTERNATIVE_NEXT, /**< next alternative */ + RE_OP_NO_ALTERNATIVE, /**< no alternative */ + + RE_OP_CAPTURING_GROUP_START, /**< start of a capturing group */ + RE_OP_NON_CAPTURING_GROUP_START, /**< start of a non-capturing group */ + + RE_OP_GREEDY_CAPTURING_GROUP_END, /**< end of a greedy capturing group */ + RE_OP_GREEDY_NON_CAPTURING_GROUP_END, /**< end of a greedy non-capturing group */ + RE_OP_LAZY_CAPTURING_GROUP_END, /**< end of a lazy capturing group */ + RE_OP_LAZY_NON_CAPTURING_GROUP_END, /**< end of a lazy non-capturing group */ + RE_OP_GREEDY_ITERATOR, /**< greedy iterator */ - RE_OP_NON_GREEDY_ITERATOR, /**< non-greedy iterator */ - RE_OP_ASSERT_START, /**< "^" */ - RE_OP_ASSERT_END, /**< "$" */ - RE_OP_ASSERT_WORD_BOUNDARY, /**< "\b" */ - RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */ - RE_OP_LOOKAHEAD_POS, /**< lookahead pos */ - RE_OP_LOOKAHEAD_NEG, /**< lookahead neg */ - RE_OP_BACKREFERENCE, /**< 
"\[0..9]" */ - RE_OP_CHAR_CLASS, /**< "[ ]" */ - RE_OP_INV_CHAR_CLASS /**< "[^ ]" */ + RE_OP_LAZY_ITERATOR, /**< lazy iterator */ + RE_OP_ITERATOR_END, /**< end of an iterator */ + + RE_OP_BACKREFERENCE, /**< backreference */ + + RE_OP_ASSERT_LINE_START, /**< line start assertion */ + RE_OP_ASSERT_LINE_END, /**< line end assertion */ + RE_OP_ASSERT_WORD_BOUNDARY, /**< word boundary assertion */ + RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< not word boundary assertion */ + RE_OP_ASSERT_LOOKAHEAD_POS, /**< positive lookahead assertion */ + RE_OP_ASSERT_LOOKAHEAD_NEG, /**< negative lookahead assertion */ + RE_OP_ASSERT_END, /**< end of an assertion */ + + RE_OP_CLASS_ESCAPE, /**< class escape */ + RE_OP_CHAR_CLASS, /**< character class */ +#if ENABLED (JERRY_ES2015) + RE_OP_UNICODE_PERIOD, /**< period in full unicode mode */ +#endif /* ENABLED (JERRY_ES2015) */ + RE_OP_PERIOD, /**< period in non-unicode mode */ + RE_OP_CHAR, /**< any code point */ + RE_OP_BYTE, /**< 1-byte utf8 character */ } re_opcode_t; /** @@ -85,42 +100,31 @@ typedef enum typedef struct { ecma_compiled_code_t header; /**< compiled code header */ + uint32_t captures_count; /**< number of capturing groups */ + uint32_t non_captures_count; /**< number of non-capturing groups */ ecma_value_t source; /**< original RegExp pattern */ - uint32_t captures_count; /**< number of capturing brackets */ - uint32_t non_captures_count; /**< number of non capturing brackets */ } re_compiled_code_t; -/** - * Context of RegExp bytecode container - */ -typedef struct -{ - uint8_t *block_start_p; /**< start of bytecode block */ - uint8_t *block_end_p; /**< end of bytecode block */ - uint8_t *current_p; /**< current position in bytecode */ -} re_bytecode_ctx_t; +void re_initialize_regexp_bytecode (re_compiler_ctx_t *re_ctx_p); +uint32_t re_bytecode_size (re_compiler_ctx_t *re_ctx_p); -re_opcode_t re_get_opcode (const uint8_t **bc_p); -ecma_char_t re_get_char (const uint8_t **bc_p); -uint32_t re_get_value (const uint8_t 
**bc_p); -uint32_t JERRY_ATTR_PURE re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p); - -void re_initialize_regexp_bytecode (re_bytecode_ctx_t *bc_ctx_p); +void re_append_opcode (re_compiler_ctx_t *re_ctx_p, const re_opcode_t opcode); +void re_append_byte (re_compiler_ctx_t *re_ctx_p, const uint8_t byte); +void re_append_char (re_compiler_ctx_t *re_ctx_p, const lit_code_point_t cp); +void re_append_value (re_compiler_ctx_t *re_ctx_p, const uint32_t value); -void re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, const re_opcode_t opcode); -void re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, const uint32_t value); -void re_append_char (re_bytecode_ctx_t *bc_ctx_p, const ecma_char_t input_char); -void re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, uint32_t value); +void re_insert_opcode (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const re_opcode_t opcode); +void re_insert_byte (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const uint8_t byte); +void re_insert_char (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const lit_code_point_t cp); +void re_insert_value (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const uint32_t value); -void re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, const uint32_t offset, const re_opcode_t opcode); -void re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, const uint32_t offset, const uint32_t value); -void re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, - const size_t offset, - const uint8_t *bytecode_p, - const size_t length); +re_opcode_t re_get_opcode (const uint8_t **bc_p); +uint8_t re_get_byte (const uint8_t **bc_p); +lit_code_point_t re_get_char (const uint8_t **bc_p, bool unicode); +uint32_t re_get_value (const uint8_t **bc_p); #if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) -void re_dump_bytecode (re_bytecode_ctx_t *bc_ctx); +void re_dump_bytecode (re_compiler_ctx_t *bc_ctx); #endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */ /** diff --git a/jerry-core/parser/regexp/re-compiler-context.h 
b/jerry-core/parser/regexp/re-compiler-context.h new file mode 100644 index 0000000000..6d7b7537e8 --- /dev/null +++ b/jerry-core/parser/regexp/re-compiler-context.h @@ -0,0 +1,60 @@ +/* Copyright JS Foundation and other contributors, http://js.foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RE_COMPILER_CONTEXT_H +#define RE_COMPILER_CONTEXT_H + +#if ENABLED (JERRY_BUILTIN_REGEXP) + +#include "re-token.h" + +/** \addtogroup parser Parser + * @{ + * + * \addtogroup regexparser Regular expression + * @{ + * + * \addtogroup regexparser_compiler Compiler + * @{ + */ + +/** + * RegExp compiler context + */ +typedef struct +{ + const lit_utf8_byte_t *input_start_p; /**< start of input pattern */ + const lit_utf8_byte_t *input_curr_p; /**< current position in input pattern */ + const lit_utf8_byte_t *input_end_p; /**< end of input pattern */ + + uint8_t *bytecode_start_p; /**< start of bytecode block */ + size_t bytecode_size; /**< size of bytecode */ + + uint32_t captures_count; /**< number of capture groups */ + uint32_t non_captures_count; /**< number of non-capture groups */ + + int groups_count; /**< number of groups */ + uint16_t flags; /**< RegExp flags */ + re_token_t token; /**< current token */ +} re_compiler_ctx_t; + +/** + * @} + * @} + * @} + */ + +#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */ +#endif /* !RE_COMPILER_CONTEXT_H */ diff --git a/jerry-core/parser/regexp/re-compiler.c b/jerry-core/parser/regexp/re-compiler.c index 
f82f890925..c28fd17005 100644 --- a/jerry-core/parser/regexp/re-compiler.c +++ b/jerry-core/parser/regexp/re-compiler.c @@ -23,6 +23,7 @@ #include "jmem.h" #include "re-bytecode.h" #include "re-compiler.h" +#include "re-compiler-context.h" #include "re-parser.h" #if ENABLED (JERRY_BUILTIN_REGEXP) @@ -38,896 +39,140 @@ */ /** - * Insert simple atom iterator + * Search for the given pattern in the RegExp cache. * - * @return empty ecma value - if inserted successfully - * error ecma value - otherwise - * - * Returned value must be freed with ecma_free_value + * @return pointer to bytecode if found + * NULL - otherwise */ -static ecma_value_t -re_insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - uint32_t new_atom_start_offset) /**< atom start offset */ +static re_compiled_code_t * +re_cache_lookup (ecma_string_t *pattern_str_p, /**< pattern string */ + uint16_t flags) /**< flags */ { - uint32_t atom_code_length; - uint32_t offset; - uint32_t qmin, qmax; - - qmin = re_ctx_p->current_token.qmin; - qmax = re_ctx_p->current_token.qmax; + re_compiled_code_t **cache_p = JERRY_CONTEXT (re_cache); - if (qmin == 1 && qmax == 1) - { - return ECMA_VALUE_EMPTY; - } - else if (qmin > qmax) - { - /* ECMA-262 v5.1 15.10.2.5 */ - return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: min > max.")); - } - - /* TODO: optimize bytecode length. 
Store 0 rather than INF */ - - re_append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */ - uint32_t bytecode_length = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - atom_code_length = (uint32_t) (bytecode_length - new_atom_start_offset); - - offset = new_atom_start_offset; - re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length); - re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax); - re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin); - if (re_ctx_p->current_token.greedy) - { - re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR); - } - else + for (uint8_t idx = 0u; idx < RE_CACHE_SIZE; idx++) { - re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR); - } - - return ECMA_VALUE_EMPTY; -} /* re_insert_simple_iterator */ + re_compiled_code_t *cached_bytecode_p = cache_p[idx]; -/** - * Get the type of a group start - * - * @return RegExp opcode - */ -static re_opcode_t -re_get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - bool is_capturable) /**< is capturable group */ -{ - if (is_capturable) - { - if (re_ctx_p->current_token.qmin == 0) + if (cached_bytecode_p == NULL) { - if (re_ctx_p->current_token.greedy) - { - return RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START; - } - - return RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START; - } - - return RE_OP_CAPTURE_GROUP_START; - } - - if (re_ctx_p->current_token.qmin == 0) - { - if (re_ctx_p->current_token.greedy) - { - return RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START; + break; } - return RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START; - } - - return RE_OP_NON_CAPTURE_GROUP_START; -} /* re_get_start_opcode_type */ + ecma_string_t *cached_pattern_str_p = ecma_get_string_from_value (cached_bytecode_p->source); -/** - * Get the type of a group end - * - * @return RegExp opcode - */ -static re_opcode_t -re_get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - bool is_capturable) 
/**< is capturable group */ -{ - if (is_capturable) - { - if (re_ctx_p->current_token.greedy) + if ((cached_bytecode_p->header.status_flags & RE_FLAGS_MASK) == flags + && ecma_compare_ecma_strings (cached_pattern_str_p, pattern_str_p)) { - return RE_OP_CAPTURE_GREEDY_GROUP_END; + return cached_bytecode_p; } - - return RE_OP_CAPTURE_NON_GREEDY_GROUP_END; - } - - if (re_ctx_p->current_token.greedy) - { - return RE_OP_NON_CAPTURE_GREEDY_GROUP_END; } - return RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END; -} /* re_get_end_opcode_type */ + return NULL; +} /* re_cache_lookup */ /** - * Enclose the given bytecode to a group - * - * @return empty ecma value - if inserted successfully - * error ecma value - otherwise - * - * Returned value must be freed with ecma_free_value + * Run garbage collection in RegExp cache. */ -static ecma_value_t -re_insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - uint32_t group_start_offset, /**< offset of group start */ - uint32_t idx, /**< index of group */ - bool is_capturable) /**< is capturable group */ -{ - uint32_t qmin = re_ctx_p->current_token.qmin; - uint32_t qmax = re_ctx_p->current_token.qmax; - - if (qmin > qmax) - { - /* ECMA-262 v5.1 15.10.2.5 */ - return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: min > max.")); - } - - re_opcode_t start_opcode = re_get_start_opcode_type (re_ctx_p, is_capturable); - re_opcode_t end_opcode = re_get_end_opcode_type (re_ctx_p, is_capturable); - - uint32_t start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - re_insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx); - re_insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode); - start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - start_head_offset_len; - re_append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode); - re_append_u32 (re_ctx_p->bytecode_ctx_p, idx); - re_append_u32 (re_ctx_p->bytecode_ctx_p, qmin); - re_append_u32 
(re_ctx_p->bytecode_ctx_p, qmax); - - group_start_offset += start_head_offset_len; - re_append_jump_offset (re_ctx_p->bytecode_ctx_p, - re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset); - - if (start_opcode != RE_OP_CAPTURE_GROUP_START && start_opcode != RE_OP_NON_CAPTURE_GROUP_START) - { - re_insert_u32 (re_ctx_p->bytecode_ctx_p, - group_start_offset, - re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset); - } - - return ECMA_VALUE_EMPTY; -} /* re_insert_into_group */ - -/** - * Enclose the given bytecode to a group and inster jump value - * - * @return empty ecma value - if inserted successfully - * error ecma value - otherwise - * - * Returned value must be freed with ecma_free_value - */ -static ecma_value_t -re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - uint32_t group_start_offset, /**< offset of group start */ - uint32_t idx, /**< index of group */ - bool is_capturable) /**< is capturable group */ -{ - re_insert_u32 (re_ctx_p->bytecode_ctx_p, - group_start_offset, - re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset); - return re_insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable); -} /* re_insert_into_group_with_jump */ - -/** - * Append a character class range to the bytecode - */ -static void -re_append_char_class (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - lit_code_point_t start, /**< character class range from */ - lit_code_point_t end) /**< character class range to */ -{ - re_ctx_p->parser_ctx_p->classes_count++; - -#if ENABLED (JERRY_ES2015) - if (re_ctx_p->flags & RE_FLAG_UNICODE) - { - re_append_u32 (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (start, re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); - re_append_u32 (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (end, re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); - return; - } -#endif /* ENABLED (JERRY_ES2015) */ - - JERRY_ASSERT (start <= 
LIT_UTF16_CODE_UNIT_MAX); - JERRY_ASSERT (end <= LIT_UTF16_CODE_UNIT_MAX); - - re_append_char (re_ctx_p->bytecode_ctx_p, - (ecma_char_t) ecma_regexp_canonicalize (start, - re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); - re_append_char (re_ctx_p->bytecode_ctx_p, - (ecma_char_t) ecma_regexp_canonicalize (end, - re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); -} /* re_append_char_class */ - -/** - * Read the input pattern and parse the range of character class - * - * @return empty ecma value - if parsed successfully - * error ecma value - otherwise - * - * Returned value must be freed with ecma_free_value - */ -static ecma_value_t -re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ - re_token_t *out_token_p) /**< [out] output token */ +void +re_cache_gc (void) { - re_parser_ctx_t *const parser_ctx_p = re_ctx_p->parser_ctx_p; - out_token_p->qmax = out_token_p->qmin = 1; - parser_ctx_p->classes_count = 0; - - lit_code_point_t start = LIT_CHAR_UNDEF; - bool is_range = false; - const bool is_char_class = (re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS - || re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS); - - const ecma_char_t prev_char = lit_cesu8_peek_prev (parser_ctx_p->input_curr_p); - if (prev_char != LIT_CHAR_LEFT_SQUARE && prev_char != LIT_CHAR_CIRCUMFLEX) - { - lit_utf8_decr (&parser_ctx_p->input_curr_p); - lit_utf8_decr (&parser_ctx_p->input_curr_p); - } + re_compiled_code_t **cache_p = JERRY_CONTEXT (re_cache); - do + for (uint32_t i = 0u; i < RE_CACHE_SIZE; i++) { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string")); - } + const re_compiled_code_t *cached_bytecode_p = cache_p[i]; - lit_code_point_t ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p); - - if (ch == LIT_CHAR_RIGHT_SQUARE) + if (cached_bytecode_p == NULL) { - if (start != LIT_CHAR_UNDEF) - { - re_append_char_class (re_ctx_p, start, start); - } break; } - 
else if (ch == LIT_CHAR_MINUS) - { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '-'")); - } - - if (start != LIT_CHAR_UNDEF - && !is_range - && *parser_ctx_p->input_curr_p != LIT_CHAR_RIGHT_SQUARE) - { - is_range = true; - continue; - } - } - else if (ch == LIT_CHAR_BACKSLASH) - { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\'")); - } - - ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p); - - if (ch == LIT_CHAR_LOWERCASE_B) - { - ch = LIT_CHAR_BS; - } - else if (ch == LIT_CHAR_LOWERCASE_F) - { - ch = LIT_CHAR_FF; - } - else if (ch == LIT_CHAR_LOWERCASE_N) - { - ch = LIT_CHAR_LF; - } - else if (ch == LIT_CHAR_LOWERCASE_T) - { - ch = LIT_CHAR_TAB; - } - else if (ch == LIT_CHAR_LOWERCASE_R) - { - ch = LIT_CHAR_CR; - } - else if (ch == LIT_CHAR_LOWERCASE_V) - { - ch = LIT_CHAR_VTAB; - } - else if (ch == LIT_CHAR_LOWERCASE_C) - { - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p) - { - ch = *parser_ctx_p->input_curr_p; - - if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) - || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END) - || (ch >= LIT_CHAR_0 && ch <= LIT_CHAR_9)) - { - /* See ECMA-262 v5, 15.10.2.10 (Point 3) */ - ch = (ch % 32); - parser_ctx_p->input_curr_p++; - } - else - { - ch = LIT_CHAR_LOWERCASE_C; - } - } - } - else if (ch == LIT_CHAR_LOWERCASE_X && re_hex_lookup (parser_ctx_p, 2)) - { - ecma_char_t code_unit; - - if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit)) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\x'")); - } - - parser_ctx_p->input_curr_p += 2; - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p - && is_range == false - 
&& lit_cesu8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) - { - start = code_unit; - continue; - } - - ch = code_unit; - } - else if (ch == LIT_CHAR_LOWERCASE_U && re_hex_lookup (parser_ctx_p, 4)) - { - ecma_char_t code_unit; - - if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit)) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\u'")); - } - - parser_ctx_p->input_curr_p += 4; - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p - && is_range == false - && lit_cesu8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) - { - start = code_unit; - continue; - } - - ch = code_unit; - } - else if (ch == LIT_CHAR_LOWERCASE_D) - { - /* See ECMA-262 v5, 15.10.2.12 */ - re_append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_BEGIN, LIT_CHAR_ASCII_DIGITS_END); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_UPPERCASE_D) - { - /* See ECMA-262 v5, 15.10.2.12 */ - re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_ASCII_DIGITS_BEGIN - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_END + 1, LIT_UTF16_CODE_UNIT_MAX); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_LOWERCASE_S) - { - /* See ECMA-262 v5, 15.10.2.12 */ - re_append_char_class (re_ctx_p, LIT_CHAR_TAB, LIT_CHAR_CR); - re_append_char_class (re_ctx_p, LIT_CHAR_SP, LIT_CHAR_SP); - re_append_char_class (re_ctx_p, LIT_CHAR_NBSP, LIT_CHAR_NBSP); - re_append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); /* Ogham Space Mark */ - re_append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); /* Mongolian Vowel Separator */ - re_append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); /* En Quad - Hair Space */ - re_append_char_class (re_ctx_p, LIT_CHAR_LS, LIT_CHAR_PS); - re_append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); /* Narrow No-Break Space */ - re_append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); /* Medium Mathematical Space */ - re_append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); /* Ideographic Space */ - 
re_append_char_class (re_ctx_p, LIT_CHAR_BOM, LIT_CHAR_BOM); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_UPPERCASE_S) - { - /* See ECMA-262 v5, 15.10.2.12 */ - re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_TAB - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_CR + 1, LIT_CHAR_SP - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_SP + 1, LIT_CHAR_NBSP - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_NBSP + 1, 0x167FUL); - re_append_char_class (re_ctx_p, 0x1681UL, 0x180DUL); - re_append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL); - re_append_char_class (re_ctx_p, 0x200BUL, LIT_CHAR_LS - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_PS + 1, 0x202EUL); - re_append_char_class (re_ctx_p, 0x2030UL, 0x205EUL); - re_append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL); - re_append_char_class (re_ctx_p, 0x3001UL, LIT_CHAR_BOM - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_BOM + 1, LIT_UTF16_CODE_UNIT_MAX); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_LOWERCASE_W) - { - /* See ECMA-262 v5, 15.10.2.12 */ - re_append_char_class (re_ctx_p, LIT_CHAR_0, LIT_CHAR_9); - re_append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_A, LIT_CHAR_UPPERCASE_Z); - re_append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE, LIT_CHAR_UNDERSCORE); - re_append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_A, LIT_CHAR_LOWERCASE_Z); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_UPPERCASE_W) - { - /* See ECMA-262 v5, 15.10.2.12 */ - re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_0 - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_9 + 1, LIT_CHAR_UPPERCASE_A - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_Z + 1, LIT_CHAR_UNDERSCORE - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE + 1, LIT_CHAR_LOWERCASE_A - 1); - re_append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_Z + 1, LIT_UTF16_CODE_UNIT_MAX); - ch = LIT_CHAR_UNDEF; - } - else if (lit_char_is_octal_digit ((ecma_char_t) ch)) - { - lit_utf8_decr (&parser_ctx_p->input_curr_p); - ch = (ecma_char_t) re_parse_octal 
(parser_ctx_p); - } - } /* ch == LIT_CHAR_BACKSLASH */ - -#if ENABLED (JERRY_ES2015) - if (re_ctx_p->flags & RE_FLAG_UNICODE - && lit_is_code_point_utf16_high_surrogate (ch) - && parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p) - { - const ecma_char_t next_ch = lit_cesu8_peek_next (parser_ctx_p->input_curr_p); - if (lit_is_code_point_utf16_low_surrogate (next_ch)) - { - ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch); - lit_utf8_incr (&parser_ctx_p->input_curr_p); - } - } -#endif /* ENABLED (JERRY_ES2015) */ - - if (start != LIT_CHAR_UNDEF) - { - if (is_range) - { - if (start > ch) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, wrong order")); - } - else - { - re_append_char_class (re_ctx_p, start, ch); - start = LIT_CHAR_UNDEF; - is_range = false; - } - } - else - { - re_append_char_class (re_ctx_p, start, start); - start = ch; - } - } - else - { - start = ch; - } - } - while (is_char_class); - - return re_parse_iterator (parser_ctx_p, out_token_p); -} /* re_parse_char_class */ - -/** - * Parse alternatives - * - * @return empty ecma value - if alternative was successfully parsed - * error ecma value - otherwise - * - * Returned value must be freed with ecma_free_value - */ -static ecma_value_t -re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - bool expect_eof) /**< expect end of file */ -{ - ECMA_CHECK_STACK_USAGE (); - uint32_t idx; - re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p; - ecma_value_t ret_value = ECMA_VALUE_EMPTY; - - uint32_t alternative_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - - while (ecma_is_value_empty (ret_value)) - { - ecma_value_t next_token_result = re_parse_next_token (re_ctx_p->parser_ctx_p, - &(re_ctx_p->current_token)); - if (ECMA_IS_VALUE_ERROR (next_token_result)) - { - return next_token_result; - } - - JERRY_ASSERT (ecma_is_value_empty (next_token_result)); - - uint32_t new_atom_start_offset = 
re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - - switch (re_ctx_p->current_token.type) - { - case RE_TOK_START_CAPTURE_GROUP: - { - idx = re_ctx_p->captures_count++; - JERRY_TRACE_MSG ("Compile a capture group start (idx: %u)\n", (unsigned int) idx); - - ret_value = re_parse_alternative (re_ctx_p, false); - - if (ecma_is_value_empty (ret_value)) - { - ret_value = re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, true); - } - - break; - } - case RE_TOK_START_NON_CAPTURE_GROUP: - { - idx = re_ctx_p->non_captures_count++; - JERRY_TRACE_MSG ("Compile a non-capture group start (idx: %u)\n", (unsigned int) idx); - - ret_value = re_parse_alternative (re_ctx_p, false); - - if (ecma_is_value_empty (ret_value)) - { - ret_value = re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, false); - } - - break; - } - case RE_TOK_CHAR: - { - JERRY_TRACE_MSG ("Compile character token: %c, qmin: %u, qmax: %u\n", - (char) re_ctx_p->current_token.value, (unsigned int) re_ctx_p->current_token.qmin, - (unsigned int) re_ctx_p->current_token.qmax); - - re_append_opcode (bc_ctx_p, RE_OP_CHAR); - re_append_char (bc_ctx_p, (ecma_char_t) ecma_regexp_canonicalize ((ecma_char_t) re_ctx_p->current_token.value, - re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); - - ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); - break; - } - case RE_TOK_PERIOD: - { - JERRY_TRACE_MSG ("Compile a period\n"); - re_append_opcode (bc_ctx_p, RE_OP_PERIOD); - - ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); - break; - } - case RE_TOK_ALTERNATIVE: - { - JERRY_TRACE_MSG ("Compile an alternative\n"); - re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset); - re_append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE); - alternative_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - break; - } - case RE_TOK_ASSERT_START: - { - JERRY_TRACE_MSG ("Compile a start assertion\n"); - re_append_opcode (bc_ctx_p, 
RE_OP_ASSERT_START); - break; - } - case RE_TOK_ASSERT_END: - { - JERRY_TRACE_MSG ("Compile an end assertion\n"); - re_append_opcode (bc_ctx_p, RE_OP_ASSERT_END); - break; - } - case RE_TOK_ASSERT_WORD_BOUNDARY: - { - JERRY_TRACE_MSG ("Compile a word boundary assertion\n"); - re_append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY); - break; - } - case RE_TOK_ASSERT_NOT_WORD_BOUNDARY: - { - JERRY_TRACE_MSG ("Compile a not word boundary assertion\n"); - re_append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY); - break; - } - case RE_TOK_ASSERT_START_POS_LOOKAHEAD: - { - JERRY_TRACE_MSG ("Compile a positive lookahead assertion\n"); - idx = re_ctx_p->non_captures_count++; - re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS); - - ret_value = re_parse_alternative (re_ctx_p, false); - - if (ecma_is_value_empty (ret_value)) - { - re_append_opcode (bc_ctx_p, RE_OP_MATCH); - - ret_value = re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); - } - - break; - } - case RE_TOK_ASSERT_START_NEG_LOOKAHEAD: - { - JERRY_TRACE_MSG ("Compile a negative lookahead assertion\n"); - idx = re_ctx_p->non_captures_count++; - re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG); - - ret_value = re_parse_alternative (re_ctx_p, false); - - if (ecma_is_value_empty (ret_value)) - { - re_append_opcode (bc_ctx_p, RE_OP_MATCH); - - ret_value = re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); - } - - break; - } - case RE_TOK_BACKREFERENCE: - { - uint32_t backref = (uint32_t) re_ctx_p->current_token.value; - idx = re_ctx_p->non_captures_count++; - - if (backref > re_ctx_p->highest_backref) - { - re_ctx_p->highest_backref = backref; - } - - JERRY_TRACE_MSG ("Compile a backreference: %u\n", (unsigned int) backref); - re_append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE); - re_append_u32 (bc_ctx_p, backref); - - ret_value = re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); - break; - } - case RE_TOK_DIGIT: - case RE_TOK_NOT_DIGIT: 
- case RE_TOK_WHITE: - case RE_TOK_NOT_WHITE: - case RE_TOK_WORD_CHAR: - case RE_TOK_NOT_WORD_CHAR: - case RE_TOK_START_CHAR_CLASS: - case RE_TOK_START_INV_CHAR_CLASS: - { - JERRY_TRACE_MSG ("Compile a character class\n"); - re_append_opcode (bc_ctx_p, - re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS - ? RE_OP_INV_CHAR_CLASS - : RE_OP_CHAR_CLASS); - uint32_t offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - - ret_value = re_parse_char_class (re_ctx_p, - &(re_ctx_p->current_token)); - - if (!ECMA_IS_VALUE_ERROR (ret_value)) - { - re_insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->classes_count); - ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); - } - - break; - } - case RE_TOK_END_GROUP: - { - JERRY_TRACE_MSG ("Compile a group end\n"); - - if (expect_eof) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of paren.")); - } - - re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset); - return ECMA_VALUE_EMPTY; - } - case RE_TOK_EOF: - { - if (!expect_eof) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of pattern.")); - } - - re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset); - return ECMA_VALUE_EMPTY; - } - default: - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected RegExp token.")); - } - } - } - - return ret_value; -} /* re_parse_alternative */ - -/** - * Search for the given pattern in the RegExp cache - * - * @return index of bytecode in cache - if found - * RE_CACHE_SIZE - otherwise - */ -static uint8_t -re_find_bytecode_in_cache (ecma_string_t *pattern_str_p, /**< pattern string */ - uint16_t flags) /**< flags */ -{ - uint8_t free_idx = RE_CACHE_SIZE; - - for (uint8_t idx = 0u; idx < RE_CACHE_SIZE; idx++) - { - const re_compiled_code_t *cached_bytecode_p = JERRY_CONTEXT (re_cache)[idx]; - if (cached_bytecode_p != NULL) - { - ecma_string_t *cached_pattern_str_p = 
ecma_get_string_from_value (cached_bytecode_p->source); - - if ((cached_bytecode_p->header.status_flags & RE_FLAGS_MASK) == flags - && ecma_compare_ecma_strings (cached_pattern_str_p, pattern_str_p)) - { - JERRY_TRACE_MSG ("RegExp is found in cache\n"); - return idx; - } - } - else - { - /* mark as free, so it can be overridden if the cache is full */ - free_idx = idx; - } + ecma_bytecode_deref ((ecma_compiled_code_t *) cached_bytecode_p); + cache_p[i] = NULL; } - JERRY_TRACE_MSG ("RegExp is NOT found in cache\n"); - return free_idx; -} /* re_find_bytecode_in_cache */ - -/** - * Run gerbage collection in RegExp cache - */ -void -re_cache_gc_run (void) -{ - for (uint32_t i = 0u; i < RE_CACHE_SIZE; i++) - { - const re_compiled_code_t *cached_bytecode_p = JERRY_CONTEXT (re_cache)[i]; - - if (cached_bytecode_p != NULL - && cached_bytecode_p->header.refs == 1) - { - /* Only the cache has reference for the bytecode */ - ecma_bytecode_deref ((ecma_compiled_code_t *) cached_bytecode_p); - JERRY_CONTEXT (re_cache)[i] = NULL; - } - } -} /* re_cache_gc_run */ + JERRY_CONTEXT (re_cache_idx) = 0; +} /* re_cache_gc */ /** * Compilation of RegExp bytecode * - * @return empty ecma value - if bytecode was compiled successfully - * error ecma value - otherwise - * - * Returned value must be freed with ecma_free_value + * @return pointer to bytecode if compilation was successful + * NULL - otherwise */ -ecma_value_t -re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, /**< [out] pointer to bytecode */ - ecma_string_t *pattern_str_p, /**< pattern */ +re_compiled_code_t * +re_compile_bytecode (ecma_string_t *pattern_str_p, /**< pattern */ uint16_t flags) /**< flags */ { - ecma_value_t ret_value = ECMA_VALUE_EMPTY; - uint8_t cache_idx = re_find_bytecode_in_cache (pattern_str_p, flags); + re_compiled_code_t *cached_bytecode_p = re_cache_lookup (pattern_str_p, flags); - if (cache_idx < RE_CACHE_SIZE) + if (cached_bytecode_p != NULL) { - *out_bytecode_p = JERRY_CONTEXT 
(re_cache)[cache_idx]; - - if (*out_bytecode_p != NULL) - { - ecma_bytecode_ref ((ecma_compiled_code_t *) *out_bytecode_p); - return ret_value; - } + ecma_bytecode_ref ((ecma_compiled_code_t *) cached_bytecode_p); + return cached_bytecode_p; } - /* not in the RegExp cache, so compile it */ re_compiler_ctx_t re_ctx; re_ctx.flags = flags; - re_ctx.highest_backref = 0; + re_ctx.captures_count = 1; re_ctx.non_captures_count = 0; - re_bytecode_ctx_t bc_ctx; - re_ctx.bytecode_ctx_p = &bc_ctx; - re_initialize_regexp_bytecode (&bc_ctx); + re_initialize_regexp_bytecode (&re_ctx); ECMA_STRING_TO_UTF8_STRING (pattern_str_p, pattern_start_p, pattern_start_size); - re_parser_ctx_t parser_ctx; - parser_ctx.input_start_p = pattern_start_p; - parser_ctx.input_curr_p = (lit_utf8_byte_t *) pattern_start_p; - parser_ctx.input_end_p = pattern_start_p + pattern_start_size; - parser_ctx.groups_count = -1; - re_ctx.parser_ctx_p = &parser_ctx; + re_ctx.input_start_p = pattern_start_p; + re_ctx.input_curr_p = (lit_utf8_byte_t *) pattern_start_p; + re_ctx.input_end_p = pattern_start_p + pattern_start_size; + re_ctx.groups_count = -1; /* Parse RegExp pattern */ - re_ctx.captures_count = 1; - re_append_opcode (&bc_ctx, RE_OP_SAVE_AT_START); - ecma_value_t result = re_parse_alternative (&re_ctx, true); ECMA_FINALIZE_UTF8_STRING (pattern_start_p, pattern_start_size); if (ECMA_IS_VALUE_ERROR (result)) - { - ret_value = result; - } - /* Check for invalid backreference */ - else if (re_ctx.highest_backref >= re_ctx.captures_count) - { - ret_value = ecma_raise_syntax_error ("Invalid backreference.\n"); - } - else - { - re_append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH); - re_append_opcode (&bc_ctx, RE_OP_EOF); - - /* Initialize bytecode header */ - re_compiled_code_t *re_compiled_code_p = (re_compiled_code_t *) bc_ctx.block_start_p; - re_compiled_code_p->header.refs = 1; - re_compiled_code_p->header.status_flags = re_ctx.flags; - ecma_ref_ecma_string (pattern_str_p); - re_compiled_code_p->source = 
ecma_make_string_value (pattern_str_p); - re_compiled_code_p->captures_count = re_ctx.captures_count; - re_compiled_code_p->non_captures_count = re_ctx.non_captures_count; - } - - size_t byte_code_size = (size_t) (bc_ctx.block_end_p - bc_ctx.block_start_p); - - if (!ecma_is_value_empty (ret_value)) { /* Compilation failed, free bytecode. */ - JERRY_TRACE_MSG ("RegExp compilation failed!\n"); - jmem_heap_free_block (bc_ctx.block_start_p, byte_code_size); - *out_bytecode_p = NULL; + jmem_heap_free_block (re_ctx.bytecode_start_p, re_ctx.bytecode_size); + return NULL; } - else - { -#if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) - if (JERRY_CONTEXT (jerry_init_flags) & ECMA_INIT_SHOW_REGEXP_OPCODES) - { - re_dump_bytecode (&bc_ctx); - } -#endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */ - *out_bytecode_p = (re_compiled_code_t *) bc_ctx.block_start_p; - ((re_compiled_code_t *) bc_ctx.block_start_p)->header.size = (uint16_t) (byte_code_size >> JMEM_ALIGNMENT_LOG); + /* Align bytecode size to JMEM_ALIGNMENT so that it can be stored in the bytecode header. */ + const uint32_t final_size = JERRY_ALIGNUP (re_ctx.bytecode_size, JMEM_ALIGNMENT); + re_compiled_code_t *re_compiled_code_p = (re_compiled_code_t *) jmem_heap_realloc_block (re_ctx.bytecode_start_p, + re_ctx.bytecode_size, + final_size); - if (cache_idx == RE_CACHE_SIZE) - { - if (JERRY_CONTEXT (re_cache_idx) == RE_CACHE_SIZE) - { - JERRY_CONTEXT (re_cache_idx) = 0; - } + /* Bytecoded will be inserted into the cache and returned to the caller, so refcount is implicitly set to 2. */ + re_compiled_code_p->header.refs = 2; + re_compiled_code_p->header.size = (uint16_t) (final_size >> JMEM_ALIGNMENT_LOG); + re_compiled_code_p->header.status_flags = re_ctx.flags; - JERRY_TRACE_MSG ("RegExp cache is full! 
Remove the element on idx: %d\n", JERRY_CONTEXT (re_cache_idx)); + ecma_ref_ecma_string (pattern_str_p); + re_compiled_code_p->source = ecma_make_string_value (pattern_str_p); + re_compiled_code_p->captures_count = re_ctx.captures_count; + re_compiled_code_p->non_captures_count = re_ctx.non_captures_count; - cache_idx = JERRY_CONTEXT (re_cache_idx)++; +#if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) + if (JERRY_CONTEXT (jerry_init_flags) & ECMA_INIT_SHOW_REGEXP_OPCODES) + { + re_dump_bytecode (&re_ctx); + } +#endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */ - /* The garbage collector might run during the byte code - * allocations above and it may free this entry. */ - if (JERRY_CONTEXT (re_cache)[cache_idx] != NULL) - { - ecma_bytecode_deref ((ecma_compiled_code_t *) JERRY_CONTEXT (re_cache)[cache_idx]); - } - } + uint8_t cache_idx = JERRY_CONTEXT (re_cache_idx); - JERRY_TRACE_MSG ("Insert bytecode into RegExp cache (idx: %d).\n", cache_idx); - ecma_bytecode_ref ((ecma_compiled_code_t *) *out_bytecode_p); - JERRY_CONTEXT (re_cache)[cache_idx] = *out_bytecode_p; + if (JERRY_CONTEXT (re_cache)[cache_idx] != NULL) + { + ecma_bytecode_deref ((ecma_compiled_code_t *) JERRY_CONTEXT (re_cache)[cache_idx]); } - return ret_value; + JERRY_CONTEXT (re_cache)[cache_idx] = re_compiled_code_p; + JERRY_CONTEXT (re_cache_idx) = (uint8_t) (cache_idx + 1) % RE_CACHE_SIZE; + + return re_compiled_code_p; } /* re_compile_bytecode */ /** diff --git a/jerry-core/parser/regexp/re-compiler.h b/jerry-core/parser/regexp/re-compiler.h index 8dd2a72e10..b5f1e8a756 100644 --- a/jerry-core/parser/regexp/re-compiler.h +++ b/jerry-core/parser/regexp/re-compiler.h @@ -20,7 +20,6 @@ #include "ecma-globals.h" #include "re-bytecode.h" -#include "re-parser.h" /** \addtogroup parser Parser * @{ @@ -32,24 +31,10 @@ * @{ */ -/** - * Context of RegExp compiler - */ -typedef struct -{ - uint16_t flags; /**< RegExp flags */ - uint32_t captures_count; /**< number of capture groups */ - uint32_t non_captures_count; 
/**< number of non-capture groups */ - uint32_t highest_backref; /**< highest backreference */ - re_bytecode_ctx_t *bytecode_ctx_p; /**< pointer of RegExp bytecode context */ - re_token_t current_token; /**< current token */ - re_parser_ctx_t *parser_ctx_p; /**< pointer of RegExp parser context */ -} re_compiler_ctx_t; - -ecma_value_t -re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, ecma_string_t *pattern_str_p, uint16_t flags); - -void re_cache_gc_run (void); +re_compiled_code_t * +re_compile_bytecode (ecma_string_t *pattern_str_p, uint16_t flags); + +void re_cache_gc (void); /** * @} diff --git a/jerry-core/parser/regexp/re-parser.c b/jerry-core/parser/regexp/re-parser.c index 01f305e1bb..3820d679a8 100644 --- a/jerry-core/parser/regexp/re-parser.c +++ b/jerry-core/parser/regexp/re-parser.c @@ -35,234 +35,336 @@ */ /** - * Lookup a character in the input string. + * Get the start opcode for the current group. * - * @return true - if lookup number of characters ahead are hex digits - * false - otherwise + * @return RegExp opcode */ -bool -re_hex_lookup (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */ - uint32_t lookup) /**< size of lookup */ +static re_opcode_t +re_get_group_start_opcode (bool is_capturing) /**< is capturing group */ { - const lit_utf8_byte_t *curr_p = parser_ctx_p->input_curr_p; + return (is_capturing) ? RE_OP_CAPTURING_GROUP_START : RE_OP_NON_CAPTURING_GROUP_START; +} /* re_get_group_start_opcode*/ - if (JERRY_UNLIKELY (curr_p + lookup > parser_ctx_p->input_end_p)) +/** + * Get the end opcode for the current group. 
+ * + * @return RegExp opcode + */ +static re_opcode_t +re_get_group_end_opcode (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + bool is_capturing) /**< is capturing group */ +{ + if (is_capturing) { - return false; + if (re_ctx_p->token.greedy) + { + return RE_OP_GREEDY_CAPTURING_GROUP_END; + } + + return RE_OP_LAZY_CAPTURING_GROUP_END; } - for (uint32_t i = 0; i < lookup; i++) + if (re_ctx_p->token.greedy) { - if (!lit_char_is_hex_digit (*curr_p++)) - { - return false; - } + return RE_OP_GREEDY_NON_CAPTURING_GROUP_END; } - return true; -} /* re_hex_lookup */ + return RE_OP_LAZY_NON_CAPTURING_GROUP_END; +} /* re_get_group_end_opcode */ /** - * Consume non greedy (question mark) character if present. - * - * @return true - if non-greedy character found - * false - otherwise + * Enclose the given bytecode to a group. */ -static inline bool JERRY_ATTR_ALWAYS_INLINE -re_parse_non_greedy_char (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */ +static void +re_insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + uint32_t group_start_offset, /**< offset of group start */ + uint32_t idx, /**< index of group */ + uint32_t capture_start, /**< index of first nested capture */ + bool is_capturing) /**< is capturing group */ { - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p - && *parser_ctx_p->input_curr_p == LIT_CHAR_QUESTION) + uint32_t qmin = re_ctx_p->token.qmin; + uint32_t qmax = re_ctx_p->token.qmax; + + if (JERRY_UNLIKELY (!is_capturing && re_bytecode_size (re_ctx_p) == group_start_offset)) + { + return; + } + + if (qmin == 0) { - parser_ctx_p->input_curr_p++; - return true; + re_insert_value (re_ctx_p, + group_start_offset, + re_bytecode_size (re_ctx_p) - group_start_offset); } - return false; -} /* re_parse_non_greedy_char */ + re_insert_value (re_ctx_p, group_start_offset, qmin); + re_insert_value (re_ctx_p, group_start_offset, re_ctx_p->captures_count - capture_start); + + if (!is_capturing) + { + 
re_insert_value (re_ctx_p, group_start_offset, capture_start); + } + else + { + JERRY_ASSERT (idx == capture_start); + } + + re_insert_value (re_ctx_p, group_start_offset, idx); + re_insert_opcode (re_ctx_p, group_start_offset, re_get_group_start_opcode (is_capturing)); + + re_append_opcode (re_ctx_p, re_get_group_end_opcode (re_ctx_p, is_capturing)); + re_append_value (re_ctx_p, idx); + re_append_value (re_ctx_p, qmin); + re_append_value (re_ctx_p, qmax + RE_QMAX_OFFSET); +} /* re_insert_into_group */ /** - * Parse a max 3 digit long octal number from input string iterator. - * - * @return uint32_t - parsed octal number + * Insert simple atom iterator. */ -uint32_t -re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */ +static void +re_insert_atom_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + uint32_t start_offset) /**< atom start offset */ { - uint32_t number = 0; - for (int index = 0; - index < 3 - && parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p - && lit_char_is_octal_digit (*parser_ctx_p->input_curr_p); - index++) + const uint32_t qmin = re_ctx_p->token.qmin; + const uint32_t qmax = re_ctx_p->token.qmax; + + if (qmin == 1 && qmax == 1) { - number = number * 8 + lit_char_hex_to_int (*parser_ctx_p->input_curr_p++); + return; } - return number; -} /* re_parse_octal */ + re_append_opcode (re_ctx_p, RE_OP_ITERATOR_END); + re_insert_value (re_ctx_p, start_offset, re_bytecode_size (re_ctx_p) - start_offset); + re_insert_value (re_ctx_p, start_offset, qmax + RE_QMAX_OFFSET); + re_insert_value (re_ctx_p, start_offset, qmin); + re_insert_opcode (re_ctx_p, start_offset, re_ctx_p->token.greedy ? RE_OP_GREEDY_ITERATOR : RE_OP_LAZY_ITERATOR); +} /* re_insert_atom_iterator */ /** - * Parse RegExp iterators - * - * @return empty ecma value - if parsed successfully - * error ecma value - otherwise - * - * Returned value must be freed with ecma_free_value + * Insert a lookahead assertion. 
*/ -ecma_value_t -re_parse_iterator (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */ - re_token_t *re_token_p) /**< [out] output token */ +static void +re_insert_assertion_lookahead (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + uint32_t start_offset, /**< atom start offset */ + uint32_t capture_start, /**< index of first nested capture */ + bool negative) /** lookahead type */ { - ecma_value_t ret_value = ECMA_VALUE_EMPTY; + const uint32_t qmin = re_ctx_p->token.qmin; + + re_append_opcode (re_ctx_p, RE_OP_ASSERT_END); + re_insert_value (re_ctx_p, start_offset, re_bytecode_size (re_ctx_p) - start_offset); + + /* We need to clear nested capturing group results when a negative assertion or the tail after a positive assertion + * does not match, so we store the begin and end index of nested capturing groups. */ + re_insert_value (re_ctx_p, start_offset, re_ctx_p->captures_count - capture_start); + re_insert_value (re_ctx_p, start_offset, capture_start); - re_token_p->qmin = 1; - re_token_p->qmax = 1; - re_token_p->greedy = true; + /* Lookaheads always result in zero length matches, which means iterations will always stop on the first match. + * This allows us to not have to deal with iterations beyond one. Either qmin == 0 which will implicitly match, + * or qmin > 0, in which case the first iteration will decide whether the assertion matches depending on whether + * the iteration matched or not. This also allows us to ignore qmax entirely. */ + re_insert_byte (re_ctx_p, start_offset, (uint8_t) JERRY_MIN (qmin, 1)); - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) + const re_opcode_t opcode = (negative) ? RE_OP_ASSERT_LOOKAHEAD_NEG : RE_OP_ASSERT_LOOKAHEAD_POS; + re_insert_opcode (re_ctx_p, start_offset, opcode); +} /* re_insert_assertion_lookahead */ + +/** + * Consume non greedy (question mark) character if present. 
+ */ +static void +re_parse_lazy_char (re_compiler_ctx_t *re_ctx_p) /**< RegExp parser context */ +{ + if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p + && *re_ctx_p->input_curr_p == LIT_CHAR_QUESTION) { - return ret_value; + re_ctx_p->input_curr_p++; + re_ctx_p->token.greedy = false; + return; } - ecma_char_t ch = *parser_ctx_p->input_curr_p; + re_ctx_p->token.greedy = true; +} /* re_parse_lazy_char */ - switch (ch) +/** + * Parse a max 3 digit long octal number from the input string, with a decimal value less than 256. + * + * @return value of the octal number + */ +static uint32_t +re_parse_octal (re_compiler_ctx_t *re_ctx_p) /**< RegExp parser context */ +{ + JERRY_ASSERT (re_ctx_p->input_curr_p < re_ctx_p->input_end_p); + JERRY_ASSERT (lit_char_is_octal_digit (*re_ctx_p->input_curr_p)); + + uint32_t value = (uint32_t) (*re_ctx_p->input_curr_p++) - LIT_CHAR_0; + + if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p + && lit_char_is_octal_digit (*re_ctx_p->input_curr_p)) { - case LIT_CHAR_QUESTION: - { - parser_ctx_p->input_curr_p++; - re_token_p->qmin = 0; - re_token_p->qmax = 1; - re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p); - break; - } - case LIT_CHAR_ASTERISK: - { - parser_ctx_p->input_curr_p++; - re_token_p->qmin = 0; - re_token_p->qmax = RE_ITERATOR_INFINITE; - re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p); - break; - } - case LIT_CHAR_PLUS: + value = value * 8 + (*re_ctx_p->input_curr_p++) - LIT_CHAR_0; + } + + if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p + && lit_char_is_octal_digit (*re_ctx_p->input_curr_p)) + { + const uint32_t new_value = value * 8 + (*re_ctx_p->input_curr_p) - LIT_CHAR_0; + + if (new_value <= RE_MAX_OCTAL_VALUE) { - parser_ctx_p->input_curr_p++; - re_token_p->qmin = 1; - re_token_p->qmax = RE_ITERATOR_INFINITE; - re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p); - break; + value = new_value; + re_ctx_p->input_curr_p++; } - case LIT_CHAR_LEFT_BRACE: + } + + return value; +} /* 
re_parse_octal */ + +/** + * Check that the currently parsed quantifier is valid. + * + * @return ECMA_VALUE_ERROR, if quantifier is invalid + * ECMA_VALUE_EMPTY, otherwise + */ +static ecma_value_t +re_check_quantifier (re_compiler_ctx_t *re_ctx_p) +{ + if (re_ctx_p->token.qmin > re_ctx_p->token.qmax) + { + /* ECMA-262 v5.1 15.10.2.5 */ + return ecma_raise_syntax_error (ECMA_ERR_MSG ("quantifier error: min > max.")); + } + + return ECMA_VALUE_EMPTY; +} /* re_check_quantifier */ + +/** + * Parse RegExp quantifier. + * + * @return ECMA_VALUE_TRUE - if parsed successfully + * ECMA_VALUE_FALSE - otherwise + */ +static ecma_value_t +re_parse_quantifier (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */ +{ + if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p) + { + switch (*re_ctx_p->input_curr_p) { - parser_ctx_p->input_curr_p++; - uint32_t qmin = 0; - uint32_t qmax = RE_ITERATOR_INFINITE; - uint32_t digits = 0; + case LIT_CHAR_QUESTION: + { + re_ctx_p->input_curr_p++; + re_ctx_p->token.qmin = 0; + re_ctx_p->token.qmax = 1; - while (true) + re_parse_lazy_char (re_ctx_p); + return ECMA_VALUE_TRUE; + } + case LIT_CHAR_ASTERISK: { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid quantifier")); - } + re_ctx_p->input_curr_p++; + re_ctx_p->token.qmin = 0; + re_ctx_p->token.qmax = RE_INFINITY; + + re_parse_lazy_char (re_ctx_p); + return ECMA_VALUE_TRUE; + } + case LIT_CHAR_PLUS: + { + re_ctx_p->input_curr_p++; + re_ctx_p->token.qmin = 1; + re_ctx_p->token.qmax = RE_INFINITY; - ch = *parser_ctx_p->input_curr_p++; + re_parse_lazy_char (re_ctx_p); + return ECMA_VALUE_TRUE; + } + case LIT_CHAR_LEFT_BRACE: + { + const lit_utf8_byte_t *current_p = re_ctx_p->input_curr_p + 1; + uint32_t qmin = 0; + uint32_t qmax = RE_INFINITY; - if (lit_char_is_decimal_digit (ch)) + if (current_p >= re_ctx_p->input_end_p) { - if (digits >= ECMA_NUMBER_MAX_DIGITS) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG 
("RegExp quantifier error: too many digits.")); - } - digits++; - qmin = qmin * 10 + lit_char_hex_to_int (ch); + break; } - else if (ch == LIT_CHAR_COMMA) + + if (!lit_char_is_decimal_digit (*current_p)) { - if (qmax != RE_ITERATOR_INFINITE) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: double comma.")); - } + break; + } - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid quantifier")); - } + qmin = lit_parse_decimal (&current_p, re_ctx_p->input_end_p); - if (*parser_ctx_p->input_curr_p == LIT_CHAR_RIGHT_BRACE) - { - if (digits == 0) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: missing digits.")); - } + if (current_p >= re_ctx_p->input_end_p) + { + break; + } - parser_ctx_p->input_curr_p++; - re_token_p->qmin = qmin; - re_token_p->qmax = RE_ITERATOR_INFINITE; - break; - } + lit_utf8_byte_t ch = *current_p++; + if (ch == LIT_CHAR_RIGHT_BRACE) + { qmax = qmin; - qmin = 0; - digits = 0; } - else if (ch == LIT_CHAR_RIGHT_BRACE) + else if (ch == LIT_CHAR_COMMA) { - if (digits == 0) + if (current_p >= re_ctx_p->input_end_p) { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: missing digits.")); + break; } - if (qmax != RE_ITERATOR_INFINITE) + if (lit_char_is_decimal_digit (*current_p)) { - re_token_p->qmin = qmax; + qmax = lit_parse_decimal (&current_p, re_ctx_p->input_end_p); } - else + + if (current_p >= re_ctx_p->input_end_p || *current_p++ != LIT_CHAR_RIGHT_BRACE) { - re_token_p->qmin = qmin; + break; } - - re_token_p->qmax = qmin; - - break; } else { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: unknown char.")); + break; } - } - re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p); - break; - } - default: - { - break; + re_ctx_p->token.qmin = qmin; + re_ctx_p->token.qmax = qmax; + re_ctx_p->input_curr_p = current_p; + re_parse_lazy_char (re_ctx_p); + return ECMA_VALUE_TRUE; + } +
default: + { + break; + } } } - JERRY_ASSERT (ecma_is_value_empty (ret_value)); + re_ctx_p->token.qmin = 1; + re_ctx_p->token.qmax = 1; + re_ctx_p->token.greedy = true; - return ret_value; -} /* re_parse_iterator */ + return ECMA_VALUE_FALSE; +} /* re_parse_quantifier */ /** - * Count the number of groups in pattern + * Count the number of groups in the current pattern. */ static void -re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */ +re_count_groups (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */ { - int char_class_in = 0; - parser_ctx_p->groups_count = 0; - const lit_utf8_byte_t *curr_p = parser_ctx_p->input_start_p; + bool is_char_class = 0; + re_ctx_p->groups_count = 0; + const lit_utf8_byte_t *curr_p = re_ctx_p->input_start_p; - while (curr_p < parser_ctx_p->input_end_p) + while (curr_p < re_ctx_p->input_end_p) { switch (*curr_p++) { case LIT_CHAR_BACKSLASH: { - if (curr_p < parser_ctx_p->input_end_p) + if (curr_p < re_ctx_p->input_end_p) { lit_utf8_incr (&curr_p); } @@ -270,324 +372,424 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex } case LIT_CHAR_LEFT_SQUARE: { - char_class_in++; + is_char_class = true; break; } case LIT_CHAR_RIGHT_SQUARE: { - if (char_class_in) - { - char_class_in--; - } + is_char_class = false; break; } case LIT_CHAR_LEFT_PAREN: { - if (curr_p < parser_ctx_p->input_end_p + if (curr_p < re_ctx_p->input_end_p && *curr_p != LIT_CHAR_QUESTION - && !char_class_in) + && !is_char_class) { - parser_ctx_p->groups_count++; + re_ctx_p->groups_count++; } break; } } } -} /* re_count_num_of_groups */ +} /* re_count_groups */ +#if ENABLED (JERRY_ES2015) /** - * Read the input pattern and parse the next token for the RegExp compiler + * Check if a code point is a Syntax character * - * @return empty ecma value - if parsed successfully - * error ecma value - otherwise + * @return true, if syntax character + * false, otherwise + */ +static bool +re_is_syntax_char 
(lit_code_point_t cp) /**< code point */ +{ + return (cp == LIT_CHAR_CIRCUMFLEX + || cp == LIT_CHAR_DOLLAR_SIGN + || cp == LIT_CHAR_BACKSLASH + || cp == LIT_CHAR_DOT + || cp == LIT_CHAR_ASTERISK + || cp == LIT_CHAR_PLUS + || cp == LIT_CHAR_QUESTION + || cp == LIT_CHAR_LEFT_PAREN + || cp == LIT_CHAR_RIGHT_PAREN + || cp == LIT_CHAR_LEFT_SQUARE + || cp == LIT_CHAR_RIGHT_SQUARE + || cp == LIT_CHAR_LEFT_BRACE + || cp == LIT_CHAR_RIGHT_BRACE + || cp == LIT_CHAR_VLINE); +} /* re_is_syntax_char */ +#endif /* ENABLED (JERRY_ES2015) */ + +/** + * Parse a Character Escape or a Character Class Escape. * - * Returned value must be freed with ecma_free_value + * @return ECMA_VALUE_EMPTY, if parsed successfully + * ECMA_VALUE_ERROR, otherwise */ -ecma_value_t -re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */ - re_token_t *out_token_p) /**< [out] output token */ +static ecma_value_t +re_parse_char_escape (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */ { - ecma_value_t ret_value = ECMA_VALUE_EMPTY; + JERRY_ASSERT (re_ctx_p->input_curr_p < re_ctx_p->input_end_p); + re_ctx_p->token.type = RE_TOK_CHAR; - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) + if (lit_char_is_decimal_digit (*re_ctx_p->input_curr_p)) { - out_token_p->type = RE_TOK_EOF; - return ret_value; - } + /* NULL code point escape, only valid if there are no following digits. 
*/ + if (*re_ctx_p->input_curr_p == LIT_CHAR_0 + && (re_ctx_p->input_curr_p + 1 >= re_ctx_p->input_end_p + || !lit_char_is_decimal_digit (re_ctx_p->input_curr_p[1]))) + { + re_ctx_p->input_curr_p++; + re_ctx_p->token.value = LIT_UNICODE_CODE_POINT_NULL; + return ECMA_VALUE_EMPTY; + } + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid escape sequence")); + } +#endif /* ENABLED (JERRY_ES2015) */ + + /* Legacy octal escape sequence */ + if (lit_char_is_octal_digit (*re_ctx_p->input_curr_p)) + { + re_ctx_p->token.value = re_parse_octal (re_ctx_p); + return ECMA_VALUE_EMPTY; + } - ecma_char_t ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p); + /* Identity escape */ + re_ctx_p->token.value = *re_ctx_p->input_curr_p++; + return ECMA_VALUE_EMPTY; + } + lit_code_point_t ch = lit_cesu8_read_next (&re_ctx_p->input_curr_p); switch (ch) { - case LIT_CHAR_VLINE: + /* Character Class escapes */ + case LIT_CHAR_LOWERCASE_D: { - out_token_p->type = RE_TOK_ALTERNATIVE; + re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE; + re_ctx_p->token.value = RE_ESCAPE_DIGIT; break; } - case LIT_CHAR_CIRCUMFLEX: + case LIT_CHAR_UPPERCASE_D: { - out_token_p->type = RE_TOK_ASSERT_START; + re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE; + re_ctx_p->token.value = RE_ESCAPE_NOT_DIGIT; break; } - case LIT_CHAR_DOLLAR_SIGN: + case LIT_CHAR_LOWERCASE_S: { - out_token_p->type = RE_TOK_ASSERT_END; + re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE; + re_ctx_p->token.value = RE_ESCAPE_WHITESPACE; break; } - case LIT_CHAR_DOT: + case LIT_CHAR_UPPERCASE_S: { - out_token_p->type = RE_TOK_PERIOD; - ret_value = re_parse_iterator (parser_ctx_p, out_token_p); + re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE; + re_ctx_p->token.value = RE_ESCAPE_NOT_WHITESPACE; break; } - case LIT_CHAR_BACKSLASH: + case LIT_CHAR_LOWERCASE_W: + { + re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE; + re_ctx_p->token.value = RE_ESCAPE_WORD_CHAR; + break; + } + case 
LIT_CHAR_UPPERCASE_W: + { + re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE; + re_ctx_p->token.value = RE_ESCAPE_NOT_WORD_CHAR; + break; + } + /* Control escapes */ + case LIT_CHAR_LOWERCASE_F: { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) + re_ctx_p->token.value = LIT_CHAR_FF; + break; + } + case LIT_CHAR_LOWERCASE_N: + { + re_ctx_p->token.value = LIT_CHAR_LF; + break; + } + case LIT_CHAR_LOWERCASE_R: + { + re_ctx_p->token.value = LIT_CHAR_CR; + break; + } + case LIT_CHAR_LOWERCASE_T: + { + re_ctx_p->token.value = LIT_CHAR_TAB; + break; + } + case LIT_CHAR_LOWERCASE_V: + { + re_ctx_p->token.value = LIT_CHAR_VTAB; + break; + } + /* Control letter */ + case LIT_CHAR_LOWERCASE_C: + { + if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p) { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid regular experssion")); - } + ch = *re_ctx_p->input_curr_p; - out_token_p->type = RE_TOK_CHAR; - ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p); - - if (ch == LIT_CHAR_LOWERCASE_B) - { - out_token_p->type = RE_TOK_ASSERT_WORD_BOUNDARY; - } - else if (ch == LIT_CHAR_UPPERCASE_B) - { - out_token_p->type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY; - } - else if (ch == LIT_CHAR_LOWERCASE_F) - { - out_token_p->value = LIT_CHAR_FF; - } - else if (ch == LIT_CHAR_LOWERCASE_N) - { - out_token_p->value = LIT_CHAR_LF; - } - else if (ch == LIT_CHAR_LOWERCASE_T) - { - out_token_p->value = LIT_CHAR_TAB; - } - else if (ch == LIT_CHAR_LOWERCASE_R) - { - out_token_p->value = LIT_CHAR_CR; - } - else if (ch == LIT_CHAR_LOWERCASE_V) - { - out_token_p->value = LIT_CHAR_VTAB; - } - else if (ch == LIT_CHAR_LOWERCASE_C) - { - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p) + if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) + || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)) { - ch = *parser_ctx_p->input_curr_p; + re_ctx_p->token.value = (ch % 32); + re_ctx_p->input_curr_p++; - if 
((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) - || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)) - { - out_token_p->value = (ch % 32); - parser_ctx_p->input_curr_p++; - } - else - { - out_token_p->value = LIT_CHAR_BACKSLASH; - parser_ctx_p->input_curr_p--; - } - } - else - { - out_token_p->value = LIT_CHAR_BACKSLASH; - parser_ctx_p->input_curr_p--; + break; } } - else if (ch == LIT_CHAR_LOWERCASE_X - && re_hex_lookup (parser_ctx_p, 2)) - { - ecma_char_t code_unit; - - if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit)) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("decode error")); - } - parser_ctx_p->input_curr_p += 2; - out_token_p->value = code_unit; - } - else if (ch == LIT_CHAR_LOWERCASE_U - && re_hex_lookup (parser_ctx_p, 4)) +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) { - ecma_char_t code_unit; + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid control escape sequence")); + } +#endif /* ENABLED (JERRY_ES2015) */ - if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit)) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("decode error")); - } + re_ctx_p->token.value = LIT_CHAR_BACKSLASH; + re_ctx_p->input_curr_p--; - parser_ctx_p->input_curr_p += 4; - out_token_p->value = code_unit; - } - else if (ch == LIT_CHAR_LOWERCASE_D) - { - out_token_p->type = RE_TOK_DIGIT; - break; - } - else if (ch == LIT_CHAR_UPPERCASE_D) - { - out_token_p->type = RE_TOK_NOT_DIGIT; - break; - } - else if (ch == LIT_CHAR_LOWERCASE_S) - { - out_token_p->type = RE_TOK_WHITE; - break; - } - else if (ch == LIT_CHAR_UPPERCASE_S) - { - out_token_p->type = RE_TOK_NOT_WHITE; - break; - } - else if (ch == LIT_CHAR_LOWERCASE_W) + break; + } + /* Hex escape */ + case LIT_CHAR_LOWERCASE_X: + { + uint32_t hex_value = lit_char_hex_lookup (re_ctx_p->input_curr_p, re_ctx_p->input_end_p, 2); + if (hex_value != UINT32_MAX) 
{ - out_token_p->type = RE_TOK_WORD_CHAR; + re_ctx_p->token.value = hex_value; + re_ctx_p->input_curr_p += 2; break; } - else if (ch == LIT_CHAR_UPPERCASE_W) + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) { - out_token_p->type = RE_TOK_NOT_WORD_CHAR; - break; + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid hex escape sequence")); } - else if (lit_char_is_decimal_digit (ch)) +#endif /* ENABLED (JERRY_ES2015) */ + + re_ctx_p->token.value = LIT_CHAR_LOWERCASE_X; + break; + } + /* Unicode escape */ + case LIT_CHAR_LOWERCASE_U: + { + uint32_t hex_value = lit_char_hex_lookup (re_ctx_p->input_curr_p, re_ctx_p->input_end_p, 4); + if (hex_value != UINT32_MAX) { - if (ch == LIT_CHAR_0) + re_ctx_p->token.value = hex_value; + re_ctx_p->input_curr_p += 4; + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE + && lit_is_code_point_utf16_high_surrogate (re_ctx_p->token.value) + && re_ctx_p->input_curr_p + 6 <= re_ctx_p->input_end_p + && re_ctx_p->input_curr_p[0] == '\\' + && re_ctx_p->input_curr_p[1] == 'u') { - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p - && lit_char_is_decimal_digit (*parser_ctx_p->input_curr_p)) + hex_value = lit_char_hex_lookup (re_ctx_p->input_curr_p + 2, re_ctx_p->input_end_p, 4); + if (lit_is_code_point_utf16_low_surrogate (hex_value)) { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp escape pattern error.")); + re_ctx_p->token.value = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) re_ctx_p->token.value, + (ecma_char_t) hex_value); + re_ctx_p->input_curr_p += 6; } - - out_token_p->value = LIT_UNICODE_CODE_POINT_NULL; } - else +#endif /* ENABLED (JERRY_ES2015) */ + + break; + } + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + if (*re_ctx_p->input_curr_p == LIT_CHAR_LEFT_BRACE) { - if (parser_ctx_p->groups_count == -1) - { - re_count_num_of_groups (parser_ctx_p); - } + re_ctx_p->input_curr_p++; - if (parser_ctx_p->groups_count) + if 
(re_ctx_p->input_curr_p < re_ctx_p->input_end_p && lit_char_is_hex_digit (*re_ctx_p->input_curr_p)) { - parser_ctx_p->input_curr_p--; - uint32_t number = 0; - int index = 0; + lit_code_point_t cp = lit_char_hex_to_int (*re_ctx_p->input_curr_p++); - do + while (re_ctx_p->input_curr_p < re_ctx_p->input_end_p && lit_char_is_hex_digit (*re_ctx_p->input_curr_p)) { - if (index >= RE_MAX_RE_DECESC_DIGITS) - { - ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp escape error: decimal escape too long.")); - return ret_value; - } - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - break; - } - - ecma_char_t digit = *parser_ctx_p->input_curr_p++; + cp = cp * 16 + lit_char_hex_to_int (*re_ctx_p->input_curr_p++); - if (!lit_char_is_decimal_digit (digit)) + if (JERRY_UNLIKELY (cp > LIT_UNICODE_CODE_POINT_MAX)) { - parser_ctx_p->input_curr_p--; - break; + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid unicode escape sequence")); } - number = number * 10 + lit_char_hex_to_int (digit); - index++; } - while (true); - if ((int) number <= parser_ctx_p->groups_count) + if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p && *re_ctx_p->input_curr_p == LIT_CHAR_RIGHT_BRACE) { - out_token_p->type = RE_TOK_BACKREFERENCE; + re_ctx_p->input_curr_p++; + re_ctx_p->token.value = cp; + break; } - else - /* Invalid backreference, fallback to octal */ - { - /* Rewind to start of number. */ - parser_ctx_p->input_curr_p -= index; + } + } - /* Try to reparse as octal. */ - ecma_char_t digit = *parser_ctx_p->input_curr_p; + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid unicode escape sequence")); + } +#endif /* ENABLED (JERRY_ES2015) */ - if (!lit_char_is_octal_digit (digit)) - { - /* Not octal, keep digit character value. 
*/ - number = digit; - parser_ctx_p->input_curr_p++; - } - else - { - number = re_parse_octal (parser_ctx_p); - } - } - out_token_p->value = number; - } - else - /* Invalid backreference, fallback to octal if possible */ - { - if (!lit_char_is_octal_digit (ch)) - { - /* Not octal, keep character value. */ - out_token_p->value = ch; - } - else - { - parser_ctx_p->input_curr_p--; - out_token_p->value = re_parse_octal (parser_ctx_p); - } - } + re_ctx_p->token.value = LIT_CHAR_LOWERCASE_U; + break; + } + /* Identity escape */ + default: + { +#if ENABLED (JERRY_ES2015) + /* Must be '/', or one of SyntaxCharacter */ + if (re_ctx_p->flags & RE_FLAG_UNICODE + && ch != LIT_CHAR_SLASH + && !re_is_syntax_char (ch)) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid escape")); + } +#endif /* ENABLED (JERRY_ES2015) */ + re_ctx_p->token.value = ch; + } + } + + return ECMA_VALUE_EMPTY; +} /* re_parse_char_escape */ + +/** + * Read the input pattern and parse the next token for the RegExp compiler + * + * @return empty ecma value - if parsed successfully + * error ecma value - otherwise + * + * Returned value must be freed with ecma_free_value + */ +static ecma_value_t +re_parse_next_token (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */ +{ + if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p) + { + re_ctx_p->token.type = RE_TOK_EOF; + return ECMA_VALUE_EMPTY; + } + + ecma_char_t ch = lit_cesu8_read_next (&re_ctx_p->input_curr_p); + + switch (ch) + { + case LIT_CHAR_CIRCUMFLEX: + { + re_ctx_p->token.type = RE_TOK_ASSERT_START; + return ECMA_VALUE_EMPTY; + } + case LIT_CHAR_DOLLAR_SIGN: + { + re_ctx_p->token.type = RE_TOK_ASSERT_END; + return ECMA_VALUE_EMPTY; + } + case LIT_CHAR_VLINE: + { + re_ctx_p->token.type = RE_TOK_ALTERNATIVE; + return ECMA_VALUE_EMPTY; + } + case LIT_CHAR_DOT: + { + re_ctx_p->token.type = RE_TOK_PERIOD; + /* Check quantifier */ + break; + } + case LIT_CHAR_BACKSLASH: + { + if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p) + { + 
return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid escape")); + } + + /* DecimalEscape, Backreferences cannot start with a zero digit. */ + if (*re_ctx_p->input_curr_p > LIT_CHAR_0 && *re_ctx_p->input_curr_p <= LIT_CHAR_9) + { + const lit_utf8_byte_t *digits_p = re_ctx_p->input_curr_p; + const uint32_t value = lit_parse_decimal (&digits_p, re_ctx_p->input_end_p); + + if (re_ctx_p->groups_count < 0) + { + re_count_groups (re_ctx_p); + } + + if (value <= (uint32_t) re_ctx_p->groups_count) + { + /* Valid backreference */ + re_ctx_p->input_curr_p = digits_p; + re_ctx_p->token.type = RE_TOK_BACKREFERENCE; + re_ctx_p->token.value = value; + + /* Check quantifier */ + break; } } - else + + if (*re_ctx_p->input_curr_p == LIT_CHAR_LOWERCASE_B) + { + re_ctx_p->input_curr_p++; + re_ctx_p->token.type = RE_TOK_ASSERT_WORD_BOUNDARY; + return ECMA_VALUE_EMPTY; + } + else if (*re_ctx_p->input_curr_p == LIT_CHAR_UPPERCASE_B) + { + re_ctx_p->input_curr_p++; + re_ctx_p->token.type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY; + return ECMA_VALUE_EMPTY; + } + + const ecma_value_t parse_result = re_parse_char_escape (re_ctx_p); + + if (ECMA_IS_VALUE_ERROR (parse_result)) { - out_token_p->value = ch; + return parse_result; } - ret_value = re_parse_iterator (parser_ctx_p, out_token_p); + /* Check quantifier */ break; } case LIT_CHAR_LEFT_PAREN: { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) + if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p) { return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unterminated group")); } - if (*parser_ctx_p->input_curr_p == LIT_CHAR_QUESTION) + if (*re_ctx_p->input_curr_p == LIT_CHAR_QUESTION) { - parser_ctx_p->input_curr_p++; - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) + re_ctx_p->input_curr_p++; + if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p) { return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid group")); } - ch = *parser_ctx_p->input_curr_p++; + ch = *re_ctx_p->input_curr_p++; if (ch == LIT_CHAR_EQUALS) { - /* (?= 
*/ - out_token_p->type = RE_TOK_ASSERT_START_POS_LOOKAHEAD; + re_ctx_p->token.type = RE_TOK_ASSERT_LOOKAHEAD; + re_ctx_p->token.value = false; } else if (ch == LIT_CHAR_EXCLAMATION) { - /* (?! */ - out_token_p->type = RE_TOK_ASSERT_START_NEG_LOOKAHEAD; + re_ctx_p->token.type = RE_TOK_ASSERT_LOOKAHEAD; + re_ctx_p->token.value = true; } else if (ch == LIT_CHAR_COLON) { - /* (?: */ - out_token_p->type = RE_TOK_START_NON_CAPTURE_GROUP; + re_ctx_p->token.type = RE_TOK_START_NON_CAPTURE_GROUP; } else { @@ -596,104 +798,583 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * } else { - /* ( */ - out_token_p->type = RE_TOK_START_CAPTURE_GROUP; + re_ctx_p->token.type = RE_TOK_START_CAPTURE_GROUP; } - break; + + return ECMA_VALUE_EMPTY; } case LIT_CHAR_RIGHT_PAREN: { - out_token_p->type = RE_TOK_END_GROUP; - ret_value = re_parse_iterator (parser_ctx_p, out_token_p); - break; + re_ctx_p->token.type = RE_TOK_END_GROUP; + + return ECMA_VALUE_EMPTY; } case LIT_CHAR_LEFT_SQUARE: { - out_token_p->type = RE_TOK_START_CHAR_CLASS; - - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class")); - } + re_ctx_p->token.type = RE_TOK_CHAR_CLASS; - if (*parser_ctx_p->input_curr_p == LIT_CHAR_CIRCUMFLEX) + if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p) { - out_token_p->type = RE_TOK_START_INV_CHAR_CLASS; - parser_ctx_p->input_curr_p++; + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unterminated character class.")); } - break; + return ECMA_VALUE_EMPTY; } case LIT_CHAR_QUESTION: case LIT_CHAR_ASTERISK: case LIT_CHAR_PLUS: { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp token.")); - } - case LIT_CHAR_NULL: - { - out_token_p->type = RE_TOK_EOF; - break; + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid quantifier.")); } case LIT_CHAR_LEFT_BRACE: { -#if ENABLED (JERRY_REGEXP_STRICT_MODE) - return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp 
token.")); -#else /* !ENABLED (JERRY_REGEXP_STRICT_MODE) */ - - /* Make sure that the current '{' does not start an iterator. - * - * E.g: /\s+{3,4}/ should fail as there is nothing to iterate. - * However /\s+{3,4/ should be valid in web compatibility mode. - */ - const lit_utf8_byte_t *input_curr_p = parser_ctx_p->input_curr_p; + re_ctx_p->input_curr_p--; + if (ecma_is_value_true (re_parse_quantifier (re_ctx_p))) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Nothing to repeat.")); + } - lit_utf8_decr (&parser_ctx_p->input_curr_p); - ret_value = re_parse_iterator (parser_ctx_p, out_token_p); - if (ecma_is_value_empty (ret_value)) +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp token.")); + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Lone quantifier bracket.")); } +#endif /* ENABLED (JERRY_ES2015) */ + + re_ctx_p->input_curr_p++; + re_ctx_p->token.type = RE_TOK_CHAR; + re_ctx_p->token.value = ch; - JERRY_ASSERT (ECMA_IS_VALUE_ERROR (ret_value)); - jcontext_release_exception (); + /* Check quantifier */ + break; + } +#if ENABLED (JERRY_ES2015) + case LIT_CHAR_RIGHT_SQUARE: + case LIT_CHAR_RIGHT_BRACE: + { + if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Lone quantifier bracket.")); + } - parser_ctx_p->input_curr_p = input_curr_p; - /* It was not an iterator, continue the parsing. */ -#endif /* ENABLED (JERRY_REGEXP_STRICT_MODE) */ /* FALLTHRU */ } +#endif /* ENABLED (JERRY_ES2015) */ default: { - out_token_p->type = RE_TOK_CHAR; - out_token_p->value = ch; -#if ENABLED (JERRY_REGEXP_STRICT_MODE) - ret_value = re_parse_iterator (parser_ctx_p, out_token_p); -#else - /* In case of compatiblity mode try the following: - * 1. Try parsing an iterator after the character. - * 2.a. If no error is reported: it was an iterator so return an empty value. - * 2.b. 
If there was an error: it was not an iterator thus return the current position - * to the start of the iterator parsing and set the return value to the empty value. - * 3. The next 're_parse_next_token' call will handle the further parsing of characters. - */ - const lit_utf8_byte_t *input_curr_p = parser_ctx_p->input_curr_p; - ret_value = re_parse_iterator (parser_ctx_p, out_token_p); - - if (!ecma_is_value_empty (ret_value)) - { - jcontext_release_exception (); - parser_ctx_p->input_curr_p = input_curr_p; - ret_value = ECMA_VALUE_EMPTY; - } -#endif + re_ctx_p->token.type = RE_TOK_CHAR; + re_ctx_p->token.value = ch; + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE + && lit_is_code_point_utf16_high_surrogate (ch)) + { + const ecma_char_t next = lit_cesu8_peek_next (re_ctx_p->input_curr_p); + if (lit_is_code_point_utf16_low_surrogate (next)) + { + re_ctx_p->token.value = lit_convert_surrogate_pair_to_code_point (ch, next); + re_ctx_p->input_curr_p += LIT_UTF8_MAX_BYTES_IN_CODE_UNIT; + } + } +#endif /* ENABLED (JERRY_ES2015) */ + + /* Check quantifier */ break; } } - return ret_value; + re_parse_quantifier (re_ctx_p); + return re_check_quantifier (re_ctx_p); } /* re_parse_next_token */ +/** + * Append a character class range to the bytecode. 
+ */ +static void +re_class_add_range (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + lit_code_point_t start, /**< range begin */ + lit_code_point_t end) /**< range end */ +{ + if (re_ctx_p->flags & RE_FLAG_IGNORE_CASE) + { + start = ecma_regexp_canonicalize_char (start, re_ctx_p->flags & RE_FLAG_UNICODE); + end = ecma_regexp_canonicalize_char (end, re_ctx_p->flags & RE_FLAG_UNICODE); + } + + re_append_char (re_ctx_p, start); + re_append_char (re_ctx_p, end); +} /* re_class_add_range */ + +/** + * Add a single character to the character class + */ +static void +re_class_add_char (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + uint32_t class_offset, /**< character class bytecode offset*/ + lit_code_point_t cp) /**< code point */ +{ + if (re_ctx_p->flags & RE_FLAG_IGNORE_CASE) + { + cp = ecma_regexp_canonicalize_char (cp, re_ctx_p->flags & RE_FLAG_UNICODE); + } + + re_insert_char (re_ctx_p, class_offset, cp); +} /* re_class_add_char */ + +/** + * Invalid character code point + */ +#define RE_INVALID_CP 0xFFFFFFFF + +/** + * Read the input pattern and parse the range of character class + * + * @return empty ecma value - if parsed successfully + * error ecma value - otherwise + * + * Returned value must be freed with ecma_free_value + */ +static ecma_value_t +re_parse_char_class (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */ +{ + static const uint8_t escape_flags[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20}; + const uint32_t class_offset = re_bytecode_size (re_ctx_p); + + uint8_t found_escape_flags = 0; + uint8_t out_class_flags = 0; + + uint32_t range_count = 0; + uint32_t char_count = 0; + bool is_range = false; + + JERRY_ASSERT (re_ctx_p->input_curr_p < re_ctx_p->input_end_p); + if (*re_ctx_p->input_curr_p == LIT_CHAR_CIRCUMFLEX) + { + re_ctx_p->input_curr_p++; + out_class_flags |= RE_CLASS_INVERT; + } + + lit_code_point_t start = RE_INVALID_CP; + + while (true) + { + if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p) + { 
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unterminated character class.")); + } + + if (*re_ctx_p->input_curr_p == LIT_CHAR_RIGHT_SQUARE) + { + if (is_range) + { + if (start != RE_INVALID_CP) + { + re_class_add_char (re_ctx_p, class_offset, start); + char_count++; + } + + re_class_add_char (re_ctx_p, class_offset, LIT_CHAR_MINUS); + char_count++; + } + + re_ctx_p->input_curr_p++; + break; + } + + JERRY_ASSERT (re_ctx_p->input_curr_p < re_ctx_p->input_end_p); + lit_code_point_t current; + + if (*re_ctx_p->input_curr_p == LIT_CHAR_BACKSLASH) + { + re_ctx_p->input_curr_p++; + if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid escape")); + } + + if (*re_ctx_p->input_curr_p == LIT_CHAR_LOWERCASE_B) + { + re_ctx_p->input_curr_p++; + current = LIT_CHAR_BS; + } +#if ENABLED (JERRY_ES2015) + else if (*re_ctx_p->input_curr_p == LIT_CHAR_MINUS) + { + re_ctx_p->input_curr_p++; + current = LIT_CHAR_MINUS; + } +#endif /* ENABLED (JERRY_ES2015) */ + else if ((re_ctx_p->flags & RE_FLAG_UNICODE) == 0 + && *re_ctx_p->input_curr_p == LIT_CHAR_LOWERCASE_C + && re_ctx_p->input_curr_p + 1 < re_ctx_p->input_end_p + && (lit_char_is_decimal_digit (*(re_ctx_p->input_curr_p + 1)) + || *(re_ctx_p->input_curr_p + 1) == LIT_CHAR_UNDERSCORE)) + { + current = ((uint8_t) *(re_ctx_p->input_curr_p + 1) % 32); + re_ctx_p->input_curr_p += 2; + } + else + { + if (ECMA_IS_VALUE_ERROR (re_parse_char_escape (re_ctx_p))) + { + return ECMA_VALUE_ERROR; + } + + if (re_ctx_p->token.type == RE_TOK_CLASS_ESCAPE) + { + const uint8_t escape = (uint8_t) re_ctx_p->token.value; + found_escape_flags |= escape_flags[escape]; + current = RE_INVALID_CP; + } + else + { + JERRY_ASSERT (re_ctx_p->token.type == RE_TOK_CHAR); + current = re_ctx_p->token.value; + } + } + } +#if ENABLED (JERRY_ES2015) + else if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + current = ecma_regexp_unicode_advance (&re_ctx_p->input_curr_p, re_ctx_p->input_end_p); + } +#endif /* 
ENABLED (JERRY_ES2015) */ + else + { + current = lit_cesu8_read_next (&re_ctx_p->input_curr_p); + } + + if (is_range) + { + is_range = false; + + if (start != RE_INVALID_CP && current != RE_INVALID_CP) + { + if (start > current) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Range out of order in character class")); + } + + re_class_add_range (re_ctx_p, start, current); + range_count++; + continue; + } + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid character class")); + } +#endif /* ENABLED (JERRY_ES2015) */ + + if (start != RE_INVALID_CP) + { + re_class_add_char (re_ctx_p, class_offset, start); + char_count++; + } + else if (current != RE_INVALID_CP) + { + re_class_add_char (re_ctx_p, class_offset, current); + char_count++; + } + + re_class_add_char (re_ctx_p, class_offset, LIT_CHAR_MINUS); + char_count++; + continue; + } + + if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p + && *re_ctx_p->input_curr_p == LIT_CHAR_MINUS) + { + re_ctx_p->input_curr_p++; + start = current; + is_range = true; + continue; + } + + if (current != RE_INVALID_CP) + { + re_class_add_char (re_ctx_p, class_offset, current); + char_count++; + } + } + + uint8_t escape_count = 0; + for (ecma_class_escape_t escape = RE_ESCAPE__START; escape < RE_ESCAPE__COUNT; ++escape) + { + if (found_escape_flags & escape_flags[escape]) + { + re_insert_byte (re_ctx_p, class_offset, (uint8_t) escape); + escape_count++; + } + } + + if (range_count > 0) + { + re_insert_value (re_ctx_p, class_offset, range_count); + out_class_flags |= RE_CLASS_HAS_RANGES; + } + + if (char_count > 0) + { + re_insert_value (re_ctx_p, class_offset, char_count); + out_class_flags |= RE_CLASS_HAS_CHARS; + } + + JERRY_ASSERT (escape_count <= RE_CLASS_ESCAPE_COUNT_MASK); + out_class_flags |= escape_count; + + re_insert_byte (re_ctx_p, class_offset, out_class_flags); + re_insert_opcode (re_ctx_p, class_offset, RE_OP_CHAR_CLASS); + + 
re_parse_quantifier (re_ctx_p); + return re_check_quantifier (re_ctx_p); +} /* re_parse_char_class */ + +/** + * Parse alternatives + * + * @return empty ecma value - if alternative was successfully parsed + * error ecma value - otherwise + * + * Returned value must be freed with ecma_free_value + */ +ecma_value_t +re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + bool expect_eof) /**< expect end of file */ +{ + ECMA_CHECK_STACK_USAGE (); + uint32_t alternative_offset = re_bytecode_size (re_ctx_p); + bool first_alternative = true; + + while (true) + { + ecma_value_t next_token_result = re_parse_next_token (re_ctx_p); + if (ECMA_IS_VALUE_ERROR (next_token_result)) + { + return next_token_result; + } + + JERRY_ASSERT (ecma_is_value_empty (next_token_result)); + + uint32_t atom_offset = re_bytecode_size (re_ctx_p); + + switch (re_ctx_p->token.type) + { + case RE_TOK_START_CAPTURE_GROUP: + { + const uint32_t idx = re_ctx_p->captures_count++; + const uint32_t capture_start = idx; + + ecma_value_t result = re_parse_alternative (re_ctx_p, false); + if (ECMA_IS_VALUE_ERROR (result)) + { + return result; + } + + re_parse_quantifier (re_ctx_p); + + if (ECMA_IS_VALUE_ERROR (re_check_quantifier (re_ctx_p))) + { + return ECMA_VALUE_ERROR; + } + + re_insert_into_group (re_ctx_p, atom_offset, idx, capture_start, true); + break; + } + case RE_TOK_START_NON_CAPTURE_GROUP: + { + const uint32_t idx = re_ctx_p->non_captures_count++; + const uint32_t capture_start = re_ctx_p->captures_count; + + ecma_value_t result = re_parse_alternative (re_ctx_p, false); + if (ECMA_IS_VALUE_ERROR (result)) + { + return result; + } + + re_parse_quantifier (re_ctx_p); + + if (ECMA_IS_VALUE_ERROR (re_check_quantifier (re_ctx_p))) + { + return ECMA_VALUE_ERROR; + } + + re_insert_into_group (re_ctx_p, atom_offset, idx, capture_start, false); + break; + } + case RE_TOK_PERIOD: + { +#if ENABLED (JERRY_ES2015) + re_append_opcode (re_ctx_p, (re_ctx_p->flags & 
RE_FLAG_UNICODE) ? RE_OP_UNICODE_PERIOD : RE_OP_PERIOD); +#else /* !ENABLED (JERRY_ES2015) */ + re_append_opcode (re_ctx_p, RE_OP_PERIOD); +#endif /* !ENABLED (JERRY_ES2015) */ + + re_insert_atom_iterator (re_ctx_p, atom_offset); + break; + } + case RE_TOK_ALTERNATIVE: + { + re_insert_value (re_ctx_p, alternative_offset, re_bytecode_size (re_ctx_p) - alternative_offset); + re_insert_opcode (re_ctx_p, alternative_offset, first_alternative ? RE_OP_ALTERNATIVE_START + : RE_OP_ALTERNATIVE_NEXT); + + alternative_offset = re_bytecode_size (re_ctx_p); + first_alternative = false; + break; + } + case RE_TOK_ASSERT_START: + { + re_append_opcode (re_ctx_p, RE_OP_ASSERT_LINE_START); + break; + } + case RE_TOK_ASSERT_END: + { + re_append_opcode (re_ctx_p, RE_OP_ASSERT_LINE_END); + break; + } + case RE_TOK_ASSERT_WORD_BOUNDARY: + { + re_append_opcode (re_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY); + break; + } + case RE_TOK_ASSERT_NOT_WORD_BOUNDARY: + { + re_append_opcode (re_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY); + break; + } + case RE_TOK_ASSERT_LOOKAHEAD: + { + const uint32_t start_capture_count = re_ctx_p->captures_count; + const bool is_negative = !!re_ctx_p->token.value; + + ecma_value_t result = re_parse_alternative (re_ctx_p, false); + + if (ECMA_IS_VALUE_ERROR (result)) + { + return result; + } + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + re_ctx_p->token.qmin = 1; + re_ctx_p->token.qmax = 1; + re_ctx_p->token.greedy = true; + } + else +#endif /* ENABLED (JERRY_ES2015) */ + { + re_parse_quantifier (re_ctx_p); + + if (ECMA_IS_VALUE_ERROR (re_check_quantifier (re_ctx_p))) + { + return ECMA_VALUE_ERROR; + } + } + + re_insert_assertion_lookahead (re_ctx_p, atom_offset, start_capture_count, is_negative); + break; + } + case RE_TOK_BACKREFERENCE: + { + const uint32_t backref_idx = re_ctx_p->token.value; + re_append_opcode (re_ctx_p, RE_OP_BACKREFERENCE); + re_append_value (re_ctx_p, backref_idx); + + if (re_ctx_p->token.qmin != 1 || 
re_ctx_p->token.qmax != 1) + { + const uint32_t group_idx = re_ctx_p->non_captures_count++; + re_insert_into_group (re_ctx_p, atom_offset, group_idx, re_ctx_p->captures_count, false); + } + + break; + } + case RE_TOK_CLASS_ESCAPE: + { + const ecma_class_escape_t escape = (ecma_class_escape_t) re_ctx_p->token.value; + re_append_opcode (re_ctx_p, RE_OP_CLASS_ESCAPE); + re_append_byte (re_ctx_p, (uint8_t) escape); + + re_insert_atom_iterator (re_ctx_p, atom_offset); + break; + } + case RE_TOK_CHAR_CLASS: + { + ecma_value_t result = re_parse_char_class (re_ctx_p); + + if (ECMA_IS_VALUE_ERROR (result)) + { + return result; + } + + re_insert_atom_iterator (re_ctx_p, atom_offset); + break; + } + case RE_TOK_END_GROUP: + { + if (expect_eof) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unmatched ')'")); + } + + if (!first_alternative) + { + re_insert_value (re_ctx_p, alternative_offset, re_bytecode_size (re_ctx_p) - alternative_offset); + re_insert_opcode (re_ctx_p, alternative_offset, RE_OP_ALTERNATIVE_NEXT); + } + + return ECMA_VALUE_EMPTY; + } + case RE_TOK_EOF: + { + if (!expect_eof) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of pattern.")); + } + + if (!first_alternative) + { + re_insert_value (re_ctx_p, alternative_offset, re_bytecode_size (re_ctx_p) - alternative_offset); + re_insert_opcode (re_ctx_p, alternative_offset, RE_OP_ALTERNATIVE_NEXT); + } + + re_append_opcode (re_ctx_p, RE_OP_EOF); + return ECMA_VALUE_EMPTY; + } + default: + { + JERRY_ASSERT (re_ctx_p->token.type == RE_TOK_CHAR); + + lit_code_point_t ch = re_ctx_p->token.value; + + if (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && (re_ctx_p->flags & RE_FLAG_IGNORE_CASE) == 0) + { + re_append_opcode (re_ctx_p, RE_OP_BYTE); + re_append_byte (re_ctx_p, (uint8_t) ch); + + re_insert_atom_iterator (re_ctx_p, atom_offset); + break; + } + + if (re_ctx_p->flags & RE_FLAG_IGNORE_CASE) + { + ch = ecma_regexp_canonicalize_char (ch, re_ctx_p->flags & RE_FLAG_UNICODE); + } + + re_append_opcode 
(re_ctx_p, RE_OP_CHAR); + re_append_char (re_ctx_p, ch); + + re_insert_atom_iterator (re_ctx_p, atom_offset); + break; + } + } + } + + return ECMA_VALUE_EMPTY; +} /* re_parse_alternative */ + /** * @} * @} diff --git a/jerry-core/parser/regexp/re-parser.h b/jerry-core/parser/regexp/re-parser.h index 7e3c2e2c69..1540968b27 100644 --- a/jerry-core/parser/regexp/re-parser.h +++ b/jerry-core/parser/regexp/re-parser.h @@ -18,45 +18,18 @@ #if ENABLED (JERRY_BUILTIN_REGEXP) +#include "re-compiler-context.h" + /** \addtogroup parser Parser * @{ * * \addtogroup regexparser Regular expression * @{ * - * \addtogroup regexparser_bytecode Bytecode + * \addtogroup regexparser_parser Parser * @{ */ -/** - * RegExp token type definitions - */ -typedef enum -{ - RE_TOK_EOF, /**< EOF */ - RE_TOK_BACKREFERENCE, /**< "\[0..9]" */ - RE_TOK_CHAR, /**< any character */ - RE_TOK_ALTERNATIVE, /**< "|" */ - RE_TOK_ASSERT_START, /**< "^" */ - RE_TOK_ASSERT_END, /**< "$" */ - RE_TOK_PERIOD, /**< "." */ - RE_TOK_START_CAPTURE_GROUP, /**< "(" */ - RE_TOK_START_NON_CAPTURE_GROUP, /**< "(?:" */ - RE_TOK_END_GROUP, /**< ")" */ - RE_TOK_ASSERT_START_POS_LOOKAHEAD, /**< "(?=" */ - RE_TOK_ASSERT_START_NEG_LOOKAHEAD, /**< "(?!" */ - RE_TOK_ASSERT_WORD_BOUNDARY, /**< "\b" */ - RE_TOK_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */ - RE_TOK_DIGIT, /**< "\d" */ - RE_TOK_NOT_DIGIT, /**< "\D" */ - RE_TOK_WHITE, /**< "\s" */ - RE_TOK_NOT_WHITE, /**< "\S" */ - RE_TOK_WORD_CHAR, /**< "\w" */ - RE_TOK_NOT_WORD_CHAR, /**< "\W" */ - RE_TOK_START_CHAR_CLASS, /**< "[ ]" */ - RE_TOK_START_INV_CHAR_CLASS, /**< "[^ ]" */ -} re_token_type_t; - /** * @} * @@ -65,43 +38,16 @@ typedef enum */ /** - * RegExp constant of infinite + * Value used for infinite quantifier. 
*/ -#define RE_ITERATOR_INFINITE ((uint32_t) - 1) +#define RE_INFINITY UINT32_MAX /** - * Maximum number of decimal escape digits + * Maximum decimal value of an octal escape */ -#define RE_MAX_RE_DECESC_DIGITS 9 - -/** - * RegExp token type - */ -typedef struct -{ - re_token_type_t type; /**< type of the token */ - uint32_t value; /**< value of the token */ - uint32_t qmin; /**< minimum number of token iterations */ - uint32_t qmax; /**< maximum number of token iterations */ - bool greedy; /**< type of iteration */ -} re_token_t; - -/** - * RegExp parser context - */ -typedef struct -{ - const lit_utf8_byte_t *input_start_p; /**< start of input pattern */ - const lit_utf8_byte_t *input_curr_p; /**< current position in input pattern */ - const lit_utf8_byte_t *input_end_p; /**< end of input pattern */ - int groups_count; /**< number of groups */ - uint32_t classes_count; /**< number of character classes */ -} re_parser_ctx_t; +#define RE_MAX_OCTAL_VALUE 0xff -bool re_hex_lookup (re_parser_ctx_t *parser_ctx_p, uint32_t lookup); -uint32_t re_parse_octal (re_parser_ctx_t *parser_ctx_p); -ecma_value_t re_parse_iterator (re_parser_ctx_t *parser_ctx_p, re_token_t *re_token_p); -ecma_value_t re_parse_next_token (re_parser_ctx_t *parser_ctx_p, re_token_t *out_token_p); +ecma_value_t re_parse_alternative (re_compiler_ctx_t *re_ctx_p, bool expect_eof); /** * @} diff --git a/jerry-core/parser/regexp/re-token.h b/jerry-core/parser/regexp/re-token.h new file mode 100644 index 0000000000..fd203a196a --- /dev/null +++ b/jerry-core/parser/regexp/re-token.h @@ -0,0 +1,72 @@ +/* Copyright JS Foundation and other contributors, http://js.foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RE_TOKEN_H +#define RE_TOKEN_H + +#if ENABLED (JERRY_BUILTIN_REGEXP) + +/** \addtogroup parser Parser + * @{ + * + * \addtogroup regexparser Regular expression + * @{ + * + * \addtogroup regexparser_parser Parser + * @{ + */ + +/** + * RegExp token type definitions + */ +typedef enum +{ + RE_TOK_EOF, /**< EOF (end of the pattern) */ + RE_TOK_BACKREFERENCE, /**< "\[0..9]" */ + RE_TOK_ALTERNATIVE, /**< "|" */ + RE_TOK_ASSERT_START, /**< "^" */ + RE_TOK_ASSERT_END, /**< "$" */ + RE_TOK_PERIOD, /**< "." */ + RE_TOK_START_CAPTURE_GROUP, /**< "(" */ + RE_TOK_START_NON_CAPTURE_GROUP, /**< "(?:" */ + RE_TOK_END_GROUP, /**< ")" */ + RE_TOK_ASSERT_LOOKAHEAD, /**< "(?=" (NOTE(review): presumably also "(?!", which has no token of its own - confirm) */ + RE_TOK_ASSERT_WORD_BOUNDARY, /**< "\b" */ + RE_TOK_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */ + RE_TOK_CLASS_ESCAPE, /**< "\d \D \w \W \s \S" */ + RE_TOK_CHAR_CLASS, /**< "[ ]" (NOTE(review): presumably also the inverted form "[^ ]" - confirm) */ + RE_TOK_CHAR, /**< any character */ +} re_token_type_t; + +/** + * RegExp token: type and value of the token and its iteration parameters (qmin, qmax, greedy) + */ +typedef struct +{ + uint32_t value; /**< value of the token */ + uint32_t qmin; /**< minimum number of token iterations */ + uint32_t qmax; /**< maximum number of token iterations */ + re_token_type_t type; /**< type of the token */ + bool greedy; /**< type of iteration (NOTE(review): presumably true for greedy, false for lazy - confirm) */ +} re_token_t; + +/** + * @} + * @} + * @} + */ + +#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */ +#endif /* !RE_TOKEN_H */ diff --git a/tests/jerry/es2015/regexp-unicode.js b/tests/jerry/es2015/regexp-unicode.js new file mode 100644 index 0000000000..60ac33e836 --- /dev/null +++ b/tests/jerry/es2015/regexp-unicode.js @@ -0,0 +1,361 @@ +// Copyright JS Foundation and
other contributors, http://js.foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +var result = /\0/.exec("\u0000"); +assert (result !== null); +assert (result[0] === "\u0000"); + +result = /\0/u.exec("\u0000"); +assert (result !== null); +assert (result[0] === "\u0000"); + +result = /\000/.exec("\u0000"); +assert (result !== null); +assert (result[0] === "\u0000"); + +try { + new RegExp("\\000", 'u').exec("\u0000"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +result = /\0000/.exec("\u0000\u0030"); +assert (result !== null); +assert (result[0] === "\u0000\u0030"); + +result = /\377/.exec("\u00ff"); +assert (result !== null); +assert (result[0] === "\u00ff"); + +try { + new RegExp("\\377", 'u').exec("\u00ff"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +result = /\3777/.exec("\u00ff\u0037"); +assert (result !== null); +assert (result[0] === "\u00ff\u0037"); + +try { + new RegExp("\\3777", 'u').exec("\u00ff\u0037"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +result = /\400/.exec("\u0020\u0030"); +assert (result !== null); +assert (result[0] === "\u0020\u0030"); + +try { + new RegExp("\\400", 'u').exec("\u0020\u0030"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +result = /(\1)/.exec("\u0001"); +assert (result !== null); +assert (result[0].length === 0); + +result = /(\1)/u.exec("\u0001"); +assert (result !== null); +assert 
(result[0].length === 0); + +result = /(\2)/.exec("\u0002"); +assert (result !== null); +assert (result[0] === '\u0002'); + +try { + new RegExp("(\\2)", 'u').exec("\u0002"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +result = /\8/.exec("\u0038"); +assert (result !== null); +assert (result[0] === '8'); + +result = /\99/.exec("\u0039\u0039"); +assert (result !== null); +assert (result[0] === "99"); + +// CharClassEscape +assert (/\d+/.exec("123")[0] === "123"); +assert (/\D+/.exec("abc")[0] === "abc"); +assert (/\s+/.exec(" ")[0] === " "); +assert (/\S+/.exec("abc")[0] === "abc"); +assert (/\w+/.exec("abc")[0] === "abc"); +assert (/\W+/.exec("|||")[0] === "|||"); +assert (/\d+/u.exec("123")[0] === "123"); +assert (/\D+/u.exec("abc")[0] === "abc"); +assert (/\s+/u.exec(" ")[0] === " "); +assert (/\S+/u.exec("abc")[0] === "abc"); +assert (/\w+/u.exec("abc")[0] === "abc"); +assert (/\W+/u.exec("|||")[0] === "|||"); + +assert (/\d+/u.exec("\u{10CAF}") === null); +assert (/\D+/u.exec("\u{10CAF}")[0] === "\u{10CAF}"); +assert (/\s+/u.exec("\u{10CAF}") === null); +assert (/\S+/u.exec("\u{10CAF}")[0] === "\u{10CAF}"); +assert (/\w+/u.exec("\u{10CAF}") === null); +assert (/\W+/u.exec("\u{10CAF}")[0] === "\u{10CAF}"); + +result = /\xz/.exec("xz"); +assert (result !== null); +assert (result[0] === "xz"); + +try { + new RegExp("\\xz", "u").exec("xz"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +result = /\c/.exec("\\c"); +assert (result !== null); +assert (result[0] === "\\c"); + +try { + new RegExp("\\c", 'u').exec("\\c") + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +result = /\c1/.exec("\\c1"); +assert (result !== null); +assert (result[0] === "\\c1"); + +try { + new RegExp("\\c1", 'u').exec("\\c1"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("^+"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + 
new RegExp("$+"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("\\b+"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("\\B+"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/[\b]/.exec("\u0008")[0] === "\u0008"); +assert (/[\b]/u.exec("\u0008")[0] === "\u0008"); +assert (/[\B]/.exec("\u0042")[0] === "\u0042"); + +try { + new RegExp ("[\\B]", 'u').exec("\u0042"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/[\c1]/.exec("\u0011")[0] === "\u0011"); +assert (/[\c_]/.exec("\u001f")[0] === "\u001f"); +assert (/[\c]/.exec("\\")[0] === "\\"); +assert (/[\c]/.exec("c")[0] === "c"); + +try { + new RegExp("[\\c1]", 'u'); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("[\\c]", 'u'); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("[\\c_]", 'u'); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/{{1,2}/.exec("{{")[0] === "{{"); + +try { + new RegExp("{{1,2}", 'u').exec("{{"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/a{1,2/.exec("a{1,2")[0] === "a{1,2"); + +try { + new RegExp("a{1,2", 'u').exec("a{1,2"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/\u017f/i.exec("s") === null); +assert (/\u017f/ui.exec("s")[0] === "s"); + +assert (/𐲯/.exec("𐲯")[0] === "𐲯"); +assert (/𐲯/u.exec("𐲯")[0] === "𐲯"); +assert (/𐲯*?/.exec("𐲯")[0] === "\ud803"); +assert (/𐲯*?/u.exec("𐲯")[0] === ""); +assert (/𐲯+/.exec("𐲯𐲯𐲯")[0] === "𐲯"); +assert (/𐲯+/u.exec("𐲯𐲯𐲯")[0] === "𐲯𐲯𐲯"); + +assert (/\ud803\udc96*?/.exec("𐲖")[0] === '\ud803'); +assert (/\ud803\udc96*?/u.exec("𐲖")[0] === ''); +assert (/\ud803\udc96+/.exec("𐲖𐲖𐲖")[0] === '𐲖'); +assert (/\ud803\udc96+/u.exec("𐲖𐲖𐲖")[0] === '𐲖𐲖𐲖'); + +assert (/.*𐲗𐲘/u.exec("𐲓𐲔𐲕𐲖𐲗𐲘")[0] 
=== '𐲓𐲔𐲕𐲖𐲗𐲘'); + +assert (/[\u{10000}]/.exec("\u{10000}") === null); +assert (/[\u{10000}]/.exec("{")[0] === "{"); +assert (/[^\u{10000}]/.exec("\u{10000}")[0] === "\ud800"); +assert (/[^\u{10000}]/.exec("{") === null); + +assert (/[\uffff]/.exec("\uffff")[0] === "\uffff"); +assert (/[^\uffff]/.exec("\uffff") === null); + +assert (/[\u{10000}]/u.exec("\u{10000}")[0] === "\u{10000}"); +assert (/[\u{10000}]/u.exec("{") === null); +assert (/[^\u{10000}]/u.exec("\u{10000}") === null); +assert (/[^\u{10000}]/u.exec("{")[0] === "{"); + +assert (/[\uffff]/u.exec("\uffff")[0] === "\uffff"); +assert (/[^\uffff]/u.exec("\uffff") === null); + +assert (/a{4294967296,4294967297}/.exec("aaaa") === null); +assert (/a{4294967294,4294967295}/.exec("aaaa") === null); +assert (/a{0000000000000000001,0000000000000000002}/u.exec("aaaa")[0] === 'aa'); +assert (/(\4294967297)/.exec("\4294967297")[0] === "\4294967297"); +assert (/(\1)/u.exec("aaaa")[0] === ""); + +try { + new RegExp("a{4294967295,4294967294}", ''); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/[\d-\s]/.exec("-")[0] === "-"); +assert (/[0-\s]/.exec("-")[0] === "-"); +assert (/[\d-0]/.exec("-")[0] === "-"); + +try { + new RegExp("[\\d-\\s]", 'u').exec("-"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("[0-\\s]", 'u').exec("-"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("[\\d-0]", 'u').exec("-"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/[-]/.exec("-")[0] === "-"); +assert (/[-]/u.exec("-")[0] === "-"); +assert (/[--]/.exec("-")[0] === "-"); +assert (/[--]/u.exec("-")[0] === "-"); + +assert (/}/.exec("}")[0] === "}"); +assert (/\}/u.exec("}")[0] === "}"); + +try { + new RegExp("}", 'u').exec("}"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/]/.exec("]")[0] === "]"); +assert (/\]/u.exec("]")[0] === "]"); 
+ +try { + new RegExp("]", 'u').exec("]"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/(?=)*/.exec("")[0] === ""); +assert (/(?=)+/.exec("")[0] === ""); +assert (/(?=){1,2}/.exec("")[0] === ""); + +try { + new RegExp("(?=)*", 'u'); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("(?=)+", 'u'); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("(?=){1,2}", 'u'); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +try { + new RegExp("(?=){2,1}", ''); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} diff --git a/tests/jerry/regexp-alternatives.js b/tests/jerry/regexp-alternatives.js index d084d459c9..379702c351 100644 --- a/tests/jerry/regexp-alternatives.js +++ b/tests/jerry/regexp-alternatives.js @@ -58,3 +58,6 @@ assert (r.exec("a") == "a"); r = new RegExp ("a|bb|c|d"); assert (r.exec("b") == undefined); + +r = new RegExp("(?:a|b)\\b|\\.\\w+", "g"); +assert (r.exec("name.lower()")[0] === ".lower") diff --git a/tests/jerry/regexp-backreference.js b/tests/jerry/regexp-backreference.js index 2551cd5410..55b92f3694 100644 --- a/tests/jerry/regexp-backreference.js +++ b/tests/jerry/regexp-backreference.js @@ -24,3 +24,6 @@ assert (r == undefined); r = new RegExp ("(a)*b\\1").exec("b"); assert (r[0] == "b"); assert (r[1] == undefined); + +assert (JSON.stringify (/[[]?(a)\1/.exec("aa")) === '["aa","a"]'); +assert (JSON.stringify (/\1{2,5}()\B/.exec("asd")) === '["",""]'); diff --git a/tests/jerry/regexp-backtrack.js b/tests/jerry/regexp-backtrack.js new file mode 100644 index 0000000000..3099fe78db --- /dev/null +++ b/tests/jerry/regexp-backtrack.js @@ -0,0 +1,115 @@ +// Copyright JS Foundation and other contributors, http://js.foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +assert (JSON.stringify (/(?:(a)*){3,}/.exec("aaaab")) === '["aaaa",null]'); +assert (JSON.stringify (/((a)*){3,}/.exec("aaaab")) === '["aaaa","",null]'); +assert (JSON.stringify (/((a)+){3,}/.exec("aaaab")) === '["aaaa","a","a"]'); +assert (JSON.stringify (/((.)*){3,}/.exec("abcd")) === '["abcd","",null]'); +assert (JSON.stringify (/((.)+){3,}/.exec("abcd")) === '["abcd","d","d"]'); + +assert (JSON.stringify (/((.){1,2}){1,2}/.exec("abc")) === '["abc","c","c"]'); +assert (JSON.stringify (/(?:(a)*?)asd/.exec("aaasd")) === '["aaasd","a"]'); +assert (JSON.stringify (/(?:(a)*)asd/.exec("aaasd")) === '["aaasd","a"]'); + +assert (JSON.stringify (/(.)*((a)*|(b)*)/.exec("ab")) === '["ab","b","",null,null]'); +assert (JSON.stringify (/(.)*((x)|(y))+/.exec("xy")) === '["xy","x","y",null,"y"]'); +assert (JSON.stringify (/(.)*((y)|(x))+/.exec("xy")) === '["xy","x","y","y",null]'); + +assert (JSON.stringify (/((?:a)*)/.exec("aaaad")) === '["aaaa","aaaa"]'); +assert (JSON.stringify (/((y)+|x)+/.exec("x")) === '["x","x",null]'); +assert (JSON.stringify (/((?:y)*|x)+/.exec("x")) === '["x","x"]'); +assert (JSON.stringify (/((y)*|x)+/.exec("x")) === '["x","x",null]'); +assert (JSON.stringify (/((y)*|x)*/.exec("x")) === '["x","x",null]'); +assert (JSON.stringify (/(?:(y)*|x)*/.exec("x")) === '["x",null]'); +assert (JSON.stringify (/(?:(y)*|(x))*/.exec("x")) === '["x",null,"x"]'); + +assert (JSON.stringify (/((?:a)*)asd/.exec("aaasd")) === '["aaasd","aa"]'); +assert (JSON.stringify (/((?:a)+)asd/.exec("aaasd")) === '["aaasd","aa"]'); +assert (JSON.stringify 
(/((?:a)*?)asd/.exec("aaasd")) === '["aaasd","aa"]'); +assert (JSON.stringify (/((?:a)+?)asd/.exec("aaasd")) === '["aaasd","aa"]'); + +assert (JSON.stringify (/((y)|(z)|(a))*/.exec("yazx")) === '["yaz","z",null,"z",null]'); +assert (JSON.stringify (/((y)|(z)|(.))*/.exec("yaz")) === '["yaz","z",null,"z",null]'); +assert (JSON.stringify (/((y)*|(z)*|(a)*)*/.exec("yazx")) === '["yaz","z",null,"z",null]') +assert (JSON.stringify (/((y)|(z)|(a))*/.exec("yazx")) === '["yaz","z",null,"z",null]') +assert (JSON.stringify (/(?:(y)|(z)|(a))*/.exec("yazx")) === '["yaz",null,"z",null]') +assert (JSON.stringify (/((y)|(z)|(a))+?/.exec("yazx")) === '["y","y","y",null,null]') +assert (JSON.stringify (/(?:(y)|(z)|(a))+?/.exec("yazx")) === '["y","y",null,null]') + +assert (JSON.stringify (/(?:(x|y)*|z)*/.exec("yz")) === '["yz",null]'); +assert (JSON.stringify (/((x|y)*|z)*/.exec("yz")) == '["yz","z",null]'); +assert (JSON.stringify (/(((x|y)*|(v|w)*|z)*)asd/.exec("xyzwvxzasd")) === '["xyzwvxzasd","xyzwvxz","z",null,null]'); + +assert (JSON.stringify (/((a)*){1,3}b/.exec("ab")) === '["ab","a","a"]') +assert (JSON.stringify (/((a)*){2,3}b/.exec("ab")) === '["ab","",null]') +assert (JSON.stringify (/((a)*){3,3}b/.exec("ab")) === '["ab","",null]') + +assert (JSON.stringify (/((a)*){3,}b/.exec("aaaab")) === '["aaaab","",null]'); +assert (JSON.stringify (/((a)*)*b/.exec("aaaab")) === '["aaaab","aaaa","a"]'); + +assert (JSON.stringify (/((bb?)*)*a/.exec("bbba")) === '["bbba","bbb","b"]'); +assert (JSON.stringify (/((b)*)*a/.exec("bbba")) === '["bbba","bbb","b"]'); + +assert (JSON.stringify (/(aa|a)a/.exec("aa")) === '["aa","a"]'); +assert (JSON.stringify (/(aa|a)?a/.exec("aa")) === '["aa","a"]'); +assert (JSON.stringify (/(aa|a)+?a/.exec("aa")) === '["aa","a"]'); +assert (JSON.stringify (/(?:aa|a)a/.exec("aa")) === '["aa"]'); +assert (JSON.stringify (/(?:aa|a)?a/.exec("aa")) === '["aa"]'); +assert (JSON.stringify (/(?:aa|a)+?a/.exec("aa")) === '["aa"]'); + +assert (JSON.stringify 
(/(aa|a)a/.exec("a")) === 'null'); +assert (JSON.stringify (/(aa|a)?a/.exec("a")) === '["a",null]'); +assert (JSON.stringify (/(aa|a)+?a/.exec("a")) === 'null'); +assert (JSON.stringify (/(?:aa|a)a/.exec("a")) === 'null'); +assert (JSON.stringify (/(?:aa|a)?a/.exec("a")) === '["a"]'); +assert (JSON.stringify (/(?:aa|a)+?a/.exec("a")) === 'null'); + +assert (JSON.stringify (/a+/.exec("aaasd")) === '["aaa"]'); +assert (JSON.stringify (/a+?/.exec("aaasd")) === '["a"]'); + +assert (JSON.stringify (/a+sd/.exec("aaasd")) === '["aaasd"]'); +assert (JSON.stringify (/a+?sd/.exec("aaasd")) === '["aaasd"]'); + +assert (JSON.stringify (/a{2}sd/.exec("aaasd")) === '["aasd"]'); +assert (JSON.stringify (/a{3}sd/.exec("aaasd")) === '["aaasd"]'); + +assert (JSON.stringify (/(?=a)/.exec("a")) === '[""]'); +assert (JSON.stringify (/(?=a)+/.exec("a")) === '[""]'); +assert (JSON.stringify (/(?=a)*/.exec("a")) === '[""]'); +assert (JSON.stringify (/(?=(a))?/.exec("a")) === '["",null]'); +assert (JSON.stringify (/(?=(a))+?/.exec("a")) === '["","a"]'); +assert (JSON.stringify (/(?=(a))*?/.exec("a")) === '["",null]'); + +assert (JSON.stringify (/(?!a)/.exec("a")) === '[""]'); +assert (JSON.stringify (/(?!a)+/.exec("a")) === '[""]'); +assert (JSON.stringify (/(?!a)*/.exec("a")) === '[""]'); +assert (JSON.stringify (/(?!(a))?/.exec("a")) === '["",null]'); +assert (JSON.stringify (/(?!(a))+?/.exec("a")) === '["",null]'); +assert (JSON.stringify (/(?!(a))*?/.exec("a")) === '["",null]'); + +assert (JSON.stringify (/al(?=(ma))*ma/.exec("alma")) === '["alma",null]'); +assert (JSON.stringify (/al(?!(ma))*ma/.exec("alma")) === '["alma",null]'); +assert (JSON.stringify (/al(?=(ma))+ma/.exec("alma")) === '["alma","ma"]'); +assert (JSON.stringify (/al(?!(ma))+ma/.exec("alma")) === 'null'); + +assert (JSON.stringify (/(?=())x|/.exec("asd")) === '["",null]'); +assert (JSON.stringify (/(?!())x|/.exec("asd")) === '["",null]'); + +assert (JSON.stringify (/(().*)+.$/.exec("abcdefg")) === 
'["abcdefg","abcdef",""]'); +assert (JSON.stringify (/(().*)+?.$/.exec("abcdefg")) === '["abcdefg","abcdef",""]'); +assert (JSON.stringify (/(?:().*)+.$/.exec("abcdefg")) === '["abcdefg",""]'); +assert (JSON.stringify (/(?:().*)+?.$/.exec("abcdefg")) === '["abcdefg",""]'); + +assert (JSON.stringify(/((?=())|.)+^/.exec("a")) === '["","",""]'); +assert (JSON.stringify(/(?:(|\b\w+?){2})+$/.exec("aaaa")) === '["aaaa","aaaa"]'); diff --git a/tests/jerry/regexp-capture-groups.js b/tests/jerry/regexp-capture-groups.js index 801e062a26..c3644d56fe 100644 --- a/tests/jerry/regexp-capture-groups.js +++ b/tests/jerry/regexp-capture-groups.js @@ -196,3 +196,12 @@ assert (r.exec("aa") == "aa,a"); r = new RegExp ("(a{0,1}?){0,1}a"); assert (r.exec("aa") == "aa,a"); + +r = new RegExp ("(|.)+"); +assert (JSON.stringify (r.exec("asdfgh")) === '["asdfgh","h"]'); + +assert (JSON.stringify (/([^\W](){8,}?){5}/.exec("asdfghijk")) === '["asdfg","g",""]'); +assert (JSON.stringify (/(()+?(.+)|){3,}./u.exec("asdfghi")) === '["asdfghi","",null,null]') +assert (JSON.stringify (/(()+?(.+)|){3,}?./u.exec("asdfghi")) === '["asdfghi","",null,null]') +assert (JSON.stringify (/(?:()+?(.+)|){3,}./u.exec("asdfghi")) === '["asdfghi",null,null]') +assert (JSON.stringify (/(?:()+?(.+)|){3,}?./u.exec("asdfghi")) === '["asdfghi",null,null]') diff --git a/tests/jerry/regexp-simple-atom-and-iterations.js b/tests/jerry/regexp-simple-atom-and-iterations.js index c1f15da91b..19cf701015 100644 --- a/tests/jerry/regexp-simple-atom-and-iterations.js +++ b/tests/jerry/regexp-simple-atom-and-iterations.js @@ -88,3 +88,6 @@ assert (r.exec ("\\c3") == "\\c3"); r = /\cIasd/; assert (r.exec ("\tasd") == "\tasd"); + +r = /.??$/; +assert (JSON.stringify (r.exec("asd")) === '["d"]'); diff --git a/tests/jerry/regression-test-issue-2190.js b/tests/jerry/regression-test-issue-2190.js index ed229a2016..a811572b40 100644 --- a/tests/jerry/regression-test-issue-2190.js +++ b/tests/jerry/regression-test-issue-2190.js @@ -13,7 
+13,7 @@ // limitations under the License. try { - /(?:(?=x)){1000}xyz/.exec('xyz'); + /(?:(?=x)){10000}xyz/.exec('xyz'); assert(false); } catch (e) { assert(e instanceof RangeError); diff --git a/tests/jerry/string-prototype-trim.js b/tests/jerry/string-prototype-trim.js index 2750e0ce29..689d9d33ea 100644 --- a/tests/jerry/string-prototype-trim.js +++ b/tests/jerry/string-prototype-trim.js @@ -85,3 +85,5 @@ assert("\u000A\u000D\u2028\u202911".trim() === "11"); assert("\u0009\u000B\u000C\u0020\u00A01\u0009\u000B\u000C\u0020\u00A0".trim() === "1"); assert("\u000A\u000D\u2028\u202911\u000A\u000D\u2028\u2029".trim() === "11"); + +assert ("\u200B".trim() === '\u200B')