From 7e5f9904f02437650b6bd11751c014dc508b19e7 Mon Sep 17 00:00:00 2001 From: Kenji Fukuda Date: Tue, 3 Jul 2018 14:58:43 -0700 Subject: [PATCH] Fixing RegExp parsing for character classes interacting with ranges. Fixes #258 --- lib/Parser/DebugWriter.cpp | 2 + lib/Parser/RegexParser.cpp | 32 +++-- test/Regex/characterclass_with_range.js | 163 ++++++++++++++++++++++++ test/Regex/rlexe.xml | 6 + 4 files changed, 195 insertions(+), 8 deletions(-) create mode 100644 test/Regex/characterclass_with_range.js diff --git a/lib/Parser/DebugWriter.cpp b/lib/Parser/DebugWriter.cpp index 7ee268e89bd..4d1caf03958 100644 --- a/lib/Parser/DebugWriter.cpp +++ b/lib/Parser/DebugWriter.cpp @@ -72,6 +72,8 @@ namespace UnifiedRegex CheckForNewline(); if (c > 0xff) Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]); + else if (c == '-') + Output::Print(_u("\\x2d")); else if (c < ' ' || c > '~') Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]); else diff --git a/lib/Parser/RegexParser.cpp b/lib/Parser/RegexParser.cpp index 619a4f6e89e..692527de0c3 100644 --- a/lib/Parser/RegexParser.cpp +++ b/lib/Parser/RegexParser.cpp @@ -1931,6 +1931,7 @@ namespace UnifiedRegex codepoint_t pendingRangeStart = INVALID_CODEPOINT; codepoint_t pendingRangeEnd = INVALID_CODEPOINT; bool previousSurrogatePart = false; + while(nextChar != ']') { current = next; @@ -2034,7 +2035,7 @@ namespace UnifiedRegex lastCodepoint = INVALID_CODEPOINT; } - // If we the next character is the end of range ']', then we can't have a surrogate pair. + // If the next character is the end of range ']', then we can't have a surrogate pair. // The current character is the range end, if we don't already have a candidate. else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT) { @@ -2124,6 +2125,10 @@ namespace UnifiedRegex codepoint_t pendingRangeStart = INVALID_CODEPOINT; EncodedChar nextChar = ECLookahead(); bool previousWasASurrogate = false; + bool currIsACharSet = false; + bool prevWasACharSet = false; + bool prevprevWasACharSet = false; + while(nextChar != ']') { codepoint_t codePointToSet = INVALID_CODEPOINT; @@ -2147,30 +2152,30 @@ namespace UnifiedRegex else if (nextChar == '\\') { Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate); + codePointToSet = pendingCodePoint; if (returnedNode->tag == Node::MatchSet) { - codePointToSet = pendingCodePoint; - pendingCodePoint = INVALID_CODEPOINT; + pendingCodePoint = nextChar; if (pendingRangeStart != INVALID_CODEPOINT) { codePointSet.Set(ctAllocator, '-'); } pendingRangeStart = INVALID_CODEPOINT; codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set); + currIsACharSet = true; } else { // Just a character - codePointToSet = pendingCodePoint; pendingCodePoint = deferredCharNode.cs[0]; } } else if (nextChar == '-') { - if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']') + if ((!prevWasACharSet && (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT)) || ECLookahead(1) == ']') { - // - is just a char, or end of a range. + // - is just a char, or end of a range. If the previous char of the RegExp was a charset we want to treat it as the beginning of a range. codePointToSet = pendingCodePoint; pendingCodePoint = '-'; ECConsume(); @@ -2192,14 +2197,22 @@ namespace UnifiedRegex { if (pendingRangeStart != INVALID_CODEPOINT) { - if (pendingRangeStart > pendingCodePoint) + if (pendingRangeStart > pendingCodePoint && !prevprevWasACharSet) { //We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here //This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option Assert(!unicodeFlagPresent); Fail(JSERR_RegExpBadRange); } - codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint); + if (prevprevWasACharSet) + { + codePointSet.Set(ctAllocator, '-'); + codePointSet.Set(ctAllocator, pendingCodePoint); + } + else + { + codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint); + } pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT; } else @@ -2209,6 +2222,9 @@ namespace UnifiedRegex } nextChar = ECLookahead(); + prevprevWasACharSet = prevWasACharSet; + prevWasACharSet = currIsACharSet; + currIsACharSet = false; } if (pendingCodePoint != INVALID_CODEPOINT) diff --git a/test/Regex/characterclass_with_range.js b/test/Regex/characterclass_with_range.js new file mode 100644 index 00000000000..c9b343b9e5b --- /dev/null +++ b/test/Regex/characterclass_with_range.js @@ -0,0 +1,163 @@ +//------------------------------------------------------------------------------------------------------- +// Copyright (C) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information. +//------------------------------------------------------------------------------------------------------- + +WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js"); + +let re = /^[\s-a-z]$/; +let reIgnoreCase = /^[\s-a-z]$/i; +let reUnicode = /^[\s-z]$/u; +let reNoCharClass = /^[a-c-z]$/; + +var tests = [ + /*No Flag RegExp Tests begin*/ + { + name : "Ensure 'a-z' not counted as range", + body : function () + { + assert.isFalse(re.test("b")); + } + }, + { + name : "Ensure 'a' included in set", + body : function () + { + assert.isTrue(re.test("a")); + } + }, + { + name : "Ensure ' ' included in set", + body : function () + { + assert.isTrue(re.test(" ")); + } + }, + { + name : "Ensure 'z' included in set", + body : function () + { + assert.isTrue(re.test("z")); + } + }, + { + name : "Ensure '\t' included in set", + body : function () + { + assert.isTrue(re.test("\t")); + } + }, + { + name : "Ensure 'a-z' not counted as range", + body : function () + { + assert.isFalse(re.test("q")); + } + }, + { + name : "Ensure '\' not counted in set", + body : function () + { + assert.isFalse(re.test("\\")); + } + }, + /*No Flag RegExp Tests End*/ + /*IgnoreCase Flag RegExp Tests Begin*/ + { + name : "Ensure 'O' not included in set", + body : function () + { + assert.isFalse(reIgnoreCase.test("O")); + } + }, + { + name : "Ensure 'A' included in set", + body : function () + { + assert.isTrue(reIgnoreCase.test("A")); + } + }, + { + name : "Ensure ' ' included in set", + body : function () + { + assert.isTrue(reIgnoreCase.test(" ")); + } + }, + { + name : "Ensure 'z' included in set", + body : function () + { + assert.isTrue(reIgnoreCase.test("z")); + } + }, + { + name : "Ensure '\t' included in set", + body : function () + { + assert.isTrue(reIgnoreCase.test("\t")); + } + }, + /*IgnoreCase Flag RegExp Tests End*/ + /*Unicode Flag RegExp Tests Begin*/ + { + name : "'-' included in set since \s-z treated as union of three types, not range", + body : function () + { + assert.isTrue(reUnicode.test("-")); + } + }, + { + name : "' ' in set from \s character set", + body : function () + { + assert.isTrue(reUnicode.test(" ")); + } + }, + { + name : "b not included in '\s-z'", + body : function () + { + assert.isFalse(reUnicode.test("b")); + } + }, + /*Unicode Flag RegExp Tests End*/ + /*Non-character class tests Begin*/ + { + name : "First range is used", + body : function () + { + assert.isTrue(reNoCharClass.test("b")); + } + }, + { + name : "'-' included in set from 2nd dash", + body : function () + { + assert.isTrue(reNoCharClass.test("-")); + } + }, + { + name : "z included in set", + body : function () + { + assert.isTrue(reNoCharClass.test("z")); + } + }, + { + name : "'c-z' not viewed as range", + body : function () + { + assert.isFalse(reNoCharClass.test("y")); + } + } + /*Non-character class tests End*/ +]; + +if (typeof modifyTests !== "undefined") { + tests = modifyTests(tests); +} + +testRunner.runTests(tests, { + verbose : WScript.Arguments[0] != "summary" +}); diff --git a/test/Regex/rlexe.xml b/test/Regex/rlexe.xml index 6d957a49b5a..775d059f564 100644 --- a/test/Regex/rlexe.xml +++ b/test/Regex/rlexe.xml @@ -229,4 +229,10 @@ -args summary -endargs + + + characterclass_with_range.js + -args summary -endargs + +