Skip to content

Commit

Permalink
[1.10>master] [MERGE #5418 @kfukuda2] Fixing RegExp parsing for chara…
Browse files Browse the repository at this point in the history
…cter classes interacting with ranges.

Merge pull request #5418 from kfukuda2:RegExpCharacterClassRangeFix

Fixes #258
  • Loading branch information
Kenji Fukuda committed Jul 10, 2018
2 parents c567e4e + 288d7fe commit 2aeb63f
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 33 deletions.
2 changes: 2 additions & 0 deletions lib/Parser/DebugWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ namespace UnifiedRegex
CheckForNewline();
if (c > 0xff)
Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]);
else if (c == '-')
Output::Print(_u("\\x2d"));
else if (c < ' ' || c > '~')
Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]);
else
Expand Down
47 changes: 41 additions & 6 deletions lib/Parser/RegexParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1931,6 +1931,7 @@ namespace UnifiedRegex
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
bool previousSurrogatePart = false;

while(nextChar != ']')
{
current = next;
Expand Down Expand Up @@ -2034,7 +2035,7 @@ namespace UnifiedRegex

lastCodepoint = INVALID_CODEPOINT;
}
// If we the next character is the end of range ']', then we can't have a surrogate pair.
// If the next character is the end of range ']', then we can't have a surrogate pair.
// The current character is the range end, if we don't already have a candidate.
else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
{
Expand Down Expand Up @@ -2124,6 +2125,10 @@ namespace UnifiedRegex
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
EncodedChar nextChar = ECLookahead();
bool previousWasASurrogate = false;
bool currIsACharSet = false;
bool prevWasACharSetAndPartOfRange = false;
bool prevprevWasACharSetAndPartOfRange = false;

while(nextChar != ']')
{
codepoint_t codePointToSet = INVALID_CODEPOINT;
Expand All @@ -2133,6 +2138,7 @@ namespace UnifiedRegex
{
ECConsume();
}

// These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
{
Expand All @@ -2147,22 +2153,30 @@ namespace UnifiedRegex
else if (nextChar == '\\')
{
Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
codePointToSet = pendingCodePoint;

if (returnedNode->tag == Node::MatchSet)
{
codePointToSet = pendingCodePoint;
pendingCodePoint = INVALID_CODEPOINT;
if (pendingRangeStart != INVALID_CODEPOINT)
{
if (unicodeFlagPresent)
{
//We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
}

codePointSet.Set(ctAllocator, '-');
}

pendingCodePoint = INVALID_CODEPOINT;
pendingRangeStart = INVALID_CODEPOINT;
codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
currIsACharSet = true;
}
else
{
// Just a character
codePointToSet = pendingCodePoint;
pendingCodePoint = deferredCharNode.cs[0];
}
}
Expand All @@ -2188,9 +2202,26 @@ namespace UnifiedRegex
pendingCodePoint = NextChar();
}

if (codePointToSet != INVALID_CODEPOINT)
if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
{
if (pendingRangeStart != INVALID_CODEPOINT)
if (prevprevWasACharSetAndPartOfRange)
{
//We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
if (unicodeFlagPresent)
{
Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
}

if (pendingCodePoint != INVALID_CODEPOINT)
{
codePointSet.Set(ctAllocator, pendingCodePoint);
}

codePointSet.Set(ctAllocator, '-'); //Add '-' to set because a range was detected but turned out to be a union of character set with '-' and another atom.
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
}
else if (pendingRangeStart != INVALID_CODEPOINT)
{
if (pendingRangeStart > pendingCodePoint)
{
Expand All @@ -2199,6 +2230,7 @@ namespace UnifiedRegex
Assert(!unicodeFlagPresent);
Fail(JSERR_RegExpBadRange);
}

codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
}
Expand All @@ -2209,6 +2241,9 @@ namespace UnifiedRegex
}

nextChar = ECLookahead();
prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
currIsACharSet = false;
}

if (pendingCodePoint != INVALID_CODEPOINT)
Expand Down
1 change: 1 addition & 0 deletions lib/Parser/rterrors.h
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ RT_ERROR_MSG(JSERR_NoAccessors, 5673, "Invalid property descriptor: accessors no
RT_ERROR_MSG(JSERR_RegExpInvalidEscape, 5674, "", "Invalid regular expression: invalid escape in unicode pattern", kjstSyntaxError, 0)
RT_ERROR_MSG(JSERR_RegExpTooManyCapturingGroups, 5675, "", "Regular expression cannot have more than 32,767 capturing groups", kjstRangeError, 0)
RT_ERROR_MSG(JSERR_ProxyHandlerReturnedFalse, 5676, "Proxy %s handler returned false", "Proxy handler returned false", kjstTypeError, 0)
RT_ERROR_MSG(JSERR_UnicodeRegExpRangeContainsCharClass, 5677, "%s", "Character classes not allowed in a RegExp class range.", kjstSyntaxError, 0)

//Host errors
RT_ERROR_MSG(JSERR_HostMaybeMissingPromiseContinuationCallback, 5700, "", "Host may not have set any promise continuation callback. Promises may not be executed.", kjstTypeError, 0)
Expand Down
140 changes: 140 additions & 0 deletions test/Regex/characterclass_with_range.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------

WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");

function matchRegExp(str, regexp, expectedResult)
{
matchResult = str.match(regexp);//regexp.test(str);
errorMsg = "Expected result of match between string: '" + str + "' and regular expression: " + regexp + " to be " +
expectedResult + " but was " + matchResult;

actualResult = matchResult == null ? null : matchResult[0];
assert.areEqual(expectedResult, actualResult, errorMsg);
}

var tests = [
{
name : "RegExp tests with no flags",
body : function ()
{
let re = /[\s-a-z]/;
matchRegExp("b", re, null);
matchRegExp("a", re, "a");
matchRegExp(" ", re, " ");
matchRegExp("z", re, "z");
matchRegExp("\t", re, "\t");
matchRegExp("q", re, null);
matchRegExp("\\", re, null);
matchRegExp("\u2028", re, "\u2028");
matchRegExp("\u2009", re, "\u2009");
}
},
{
name : "RegExp tests with IgnoreCase flag set",
body : function ()
{
let reIgnoreCase = /^[\s-a-z]$/i;
matchRegExp("O", reIgnoreCase, null);
matchRegExp("A", reIgnoreCase, "A");
matchRegExp(" ", reIgnoreCase, " ");
matchRegExp("z", reIgnoreCase, "z");
matchRegExp("\t", reIgnoreCase, "\t");
matchRegExp("\u2028", reIgnoreCase, "\u2028");
matchRegExp("\u2009", reIgnoreCase, "\u2009");
}
},
{
name : "RegExp tests with Unicode flag set",
body : function ()
{
let reUnicode = /^[a-d]$/u;
matchRegExp("a", reUnicode, "a");
matchRegExp("c", reUnicode, "c");
matchRegExp("d", reUnicode, "d");
matchRegExp("C", reUnicode, null);
matchRegExp("g", reUnicode, null);
matchRegExp("\u2028", reUnicode, null);
matchRegExp("\u2009", reUnicode, null);
assert.throws(() => eval("/^[\\s-z]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in a RegExp class range.");
assert.throws(() => eval("/^[z-\\s]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in a RegExp class range.");

}
},
{
name : "Non-character class tests",
body : function ()
{
let reNoCharClass = /^[a-c-z]$/;
matchRegExp("b", reNoCharClass, "b");
matchRegExp("-", reNoCharClass, "-");
matchRegExp("z", reNoCharClass, "z");
matchRegExp("y", reNoCharClass, null);
}
},
{
name : "Regression tests from bugFixRegression",
body : function ()
{
matchRegExp(" -abc", /[\s-a-c]*/, " -a");
matchRegExp(" -abc", /[\s\-a-c]*/, " -abc");
matchRegExp(" -ab", /[a-\s-b]*/, " -ab");
matchRegExp(" -ab", /[a\-\s\-b]*/, " -ab");
assert.throws(() => eval("/^[\\s--c-!]$/.exec(\"-./0Abc!\")"), SyntaxError, "Expected an error due to 'c-!' being an invalid range.", "Invalid range in character set");
}
},
{
name : "Special character tests",
body : function ()
{
let re = /^[\s][a\sb][\s--c-f]$/;
matchRegExp(' \\', re, null);
matchRegExp(' \\ ', re, null);
matchRegExp('\\ ', re, null);
re = /[-][\d\-]/;
matchRegExp('--', re, '--');
matchRegExp('-9', re, '-9');
matchRegExp(' ', re, null);
matchRegExp('-\\', re, null);
}
},
{
name : "Negation character set tests",
body : function ()
{
let reNegationCharSet = /[\D-\s]+/;
matchRegExp('555686', reNegationCharSet, null);
matchRegExp('555-686', reNegationCharSet, '-');
matchRegExp('alphabet-123', reNegationCharSet, 'alphabet-');
}
},
{
name : "Non-range tests",
body : function ()
{
let reNonRange = /[-\w]/
matchRegExp('-', reNonRange, '-');
matchRegExp('g', reNonRange, 'g');
matchRegExp('5', reNonRange, '5');
matchRegExp(' ', reNonRange, null);
matchRegExp('\t', reNonRange, null);
matchRegExp('\u2028', reNonRange, null);
matchRegExp('\\', reNonRange, null);

reNonRange = /[\w-]/
matchRegExp('-', reNonRange, '-');
matchRegExp('g', reNonRange, 'g');
matchRegExp('5', reNonRange, '5');
matchRegExp(' ', reNonRange, null);
matchRegExp('\t', reNonRange, null);
matchRegExp('\u2028', reNonRange, null);
matchRegExp('\\', reNonRange, null);
}
}
];

testRunner.runTests(tests, {
verbose : WScript.Arguments[0] != "summary"
});
6 changes: 6 additions & 0 deletions test/Regex/rlexe.xml
Original file line number Diff line number Diff line change
Expand Up @@ -229,4 +229,10 @@
<compile-flags>-args summary -endargs</compile-flags>
</default>
</test>
<test>
<default>
<files>characterclass_with_range.js</files>
<compile-flags>-args summary -endargs</compile-flags>
</default>
</test>
</regress-exe>
20 changes: 0 additions & 20 deletions test/UnifiedRegex/bugFixRegression.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -632,26 +632,6 @@ exec(/(?:a||b)?/ /*lastIndex=0*/ , "b");
["b"] /*input="b", index=0*/
r.lastIndex=0
RegExp.${_,1,...,9}=["b","","","","","","","","",""]
exec(/[\s-a-c]*/ /*lastIndex=0*/ , " -abc");
[" -abc"] /*input=" -abc", index=0*/
r.lastIndex=0
RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
exec(/[\s\-a-c]*/ /*lastIndex=0*/ , " -abc");
[" -abc"] /*input=" -abc", index=0*/
r.lastIndex=0
RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
exec(/[a-\s-b]*/ /*lastIndex=0*/ , " -ab");
[" -ab"] /*input=" -ab", index=0*/
r.lastIndex=0
RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
exec(/[a\-\s\-b]*/ /*lastIndex=0*/ , " -ab");
[" -ab"] /*input=" -ab", index=0*/
r.lastIndex=0
RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
exec(/[\s--c-!]*/ /*lastIndex=0*/ , " -./0Abc!");
[" -./0Abc!"] /*input=" -./0Abc!", index=0*/
r.lastIndex=0
RegExp.${_,1,...,9}=[" -./0Abc!","","","","","","","","",""]
EXCEPTION
exec(/x*(?:(?=x(y*)+)y|\1x)/ /*lastIndex=0*/ , "xxy");
["xx",undefined] /*input="xxy", index=0*/
Expand Down
7 changes: 0 additions & 7 deletions test/UnifiedRegex/bugFixRegression.js
Original file line number Diff line number Diff line change
Expand Up @@ -501,13 +501,6 @@ exec(/(?:a*)?/, "");
exec(/(?:a+)?/, "");
exec(/(?:a||b)?/, "b");

// WOOB1145588
exec(/[\s-a-c]*/, " -abc");
exec(/[\s\-a-c]*/, " -abc");
exec(/[a-\s-b]*/, " -ab");
exec(/[a\-\s\-b]*/, " -ab");
exec(/[\s--c-!]*/, " -./0Abc!");

try {
var r = new RegExp("[\\s-c-a]*", "");
exec(r, " -abc");
Expand Down

0 comments on commit 2aeb63f

Please sign in to comment.