Skip to content

Commit

Permalink
Fixing RegExp parsing for character classes interacting with ranges.
Browse files Browse the repository at this point in the history
  • Loading branch information
Kenji Fukuda committed Jul 6, 2018
1 parent e782dc3 commit 7e5f990
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 8 deletions.
2 changes: 2 additions & 0 deletions lib/Parser/DebugWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ namespace UnifiedRegex
CheckForNewline();
if (c > 0xff)
Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]);
else if (c == '-')
Output::Print(_u("\\x2d"));
else if (c < ' ' || c > '~')
Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]);
else
Expand Down
32 changes: 24 additions & 8 deletions lib/Parser/RegexParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1931,6 +1931,7 @@ namespace UnifiedRegex
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
bool previousSurrogatePart = false;

while(nextChar != ']')
{
current = next;
Expand Down Expand Up @@ -2034,7 +2035,7 @@ namespace UnifiedRegex

lastCodepoint = INVALID_CODEPOINT;
}
// If we the next character is the end of range ']', then we can't have a surrogate pair.
// If the next character is the end of range ']', then we can't have a surrogate pair.
// The current character is the range end, if we don't already have a candidate.
else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
{
Expand Down Expand Up @@ -2124,6 +2125,10 @@ namespace UnifiedRegex
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
EncodedChar nextChar = ECLookahead();
bool previousWasASurrogate = false;
bool currIsACharSet = false;
bool prevWasACharSet = false;
bool prevprevWasACharSet = false;

while(nextChar != ']')
{
codepoint_t codePointToSet = INVALID_CODEPOINT;
Expand All @@ -2147,30 +2152,30 @@ namespace UnifiedRegex
else if (nextChar == '\\')
{
Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
codePointToSet = pendingCodePoint;

if (returnedNode->tag == Node::MatchSet)
{
codePointToSet = pendingCodePoint;
pendingCodePoint = INVALID_CODEPOINT;
pendingCodePoint = nextChar;
if (pendingRangeStart != INVALID_CODEPOINT)
{
codePointSet.Set(ctAllocator, '-');
}
pendingRangeStart = INVALID_CODEPOINT;
codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
currIsACharSet = true;
}
else
{
// Just a character
codePointToSet = pendingCodePoint;
pendingCodePoint = deferredCharNode.cs[0];
}
}
else if (nextChar == '-')
{
if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
if ((!prevWasACharSet && (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT)) || ECLookahead(1) == ']')
{
// - is just a char, or end of a range.
// - is just a char, or end of a range. If the previous char of the RegExp was a charset we want to treat it as the beginning of a range.
codePointToSet = pendingCodePoint;
pendingCodePoint = '-';
ECConsume();
Expand All @@ -2192,14 +2197,22 @@ namespace UnifiedRegex
{
if (pendingRangeStart != INVALID_CODEPOINT)
{
if (pendingRangeStart > pendingCodePoint)
if (pendingRangeStart > pendingCodePoint && !prevprevWasACharSet)
{
//We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
//This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
Assert(!unicodeFlagPresent);
Fail(JSERR_RegExpBadRange);
}
codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
if (prevprevWasACharSet)
{
codePointSet.Set(ctAllocator, '-');
codePointSet.Set(ctAllocator, pendingCodePoint);
}
else
{
codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
}
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
}
else
Expand All @@ -2209,6 +2222,9 @@ namespace UnifiedRegex
}

nextChar = ECLookahead();
prevprevWasACharSet = prevWasACharSet;
prevWasACharSet = currIsACharSet;
currIsACharSet = false;
}

if (pendingCodePoint != INVALID_CODEPOINT)
Expand Down
163 changes: 163 additions & 0 deletions test/Regex/characterclass_with_range.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------

WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");

let re = /^[\s-a-z]$/;
let reIgnoreCase = /^[\s-a-z]$/i;
let reUnicode = /^[\s-z]$/u;
let reNoCharClass = /^[a-c-z]$/;

var tests = [
/*No Flag RegExp Tests begin*/
{
name : "Ensure 'a-z' not counted as range",
body : function ()
{
assert.isFalse(re.test("b"));
}
},
{
name : "Ensure 'a' included in set",
body : function ()
{
assert.isTrue(re.test("a"));
}
},
{
name : "Ensure ' ' included in set",
body : function ()
{
assert.isTrue(re.test(" "));
}
},
{
name : "Ensure 'z' included in set",
body : function ()
{
assert.isTrue(re.test("z"));
}
},
{
name : "Ensure '\t' included in set",
body : function ()
{
assert.isTrue(re.test("\t"));
}
},
{
name : "Ensure 'a-z' not counted as range",
body : function ()
{
assert.isFalse(re.test("q"));
}
},
{
name : "Ensure '\' not counted in set",
body : function ()
{
assert.isFalse(re.test("\\"));
}
},
/*No Flag RegExp Tests End*/
/*IgnoreCase Flag RegExp Tests Begin*/
{
name : "Ensure 'O' not included in set",
body : function ()
{
assert.isFalse(reIgnoreCase.test("O"));
}
},
{
name : "Ensure 'A' included in set",
body : function ()
{
assert.isTrue(reIgnoreCase.test("A"));
}
},
{
name : "Ensure ' ' included in set",
body : function ()
{
assert.isTrue(reIgnoreCase.test(" "));
}
},
{
name : "Ensure 'z' included in set",
body : function ()
{
assert.isTrue(reIgnoreCase.test("z"));
}
},
{
name : "Ensure '\t' included in set",
body : function ()
{
assert.isTrue(reIgnoreCase.test("\t"));
}
},
/*IgnoreCase Flag RegExp Tests End*/
/*Unicode Flag RegExp Tests Begin*/
{
name : "'-' included in set since \s-z treated as union of three types, not range",
body : function ()
{
assert.isTrue(reUnicode.test("-"));
}
},
{
name : "' ' in set from \s character set",
body : function ()
{
assert.isTrue(reUnicode.test(" "));
}
},
{
name : "b not included in '\s-z'",
body : function ()
{
assert.isFalse(reUnicode.test("b"));
}
},
/*Unicode Flag RegExp Tests End*/
/*Non-character class tests Begin*/
{
name : "First range is used",
body : function ()
{
assert.isTrue(reNoCharClass.test("b"));
}
},
{
name : "'-' included in set from 2nd dash",
body : function ()
{
assert.isTrue(reNoCharClass.test("-"));
}
},
{
name : "z included in set",
body : function ()
{
assert.isTrue(reNoCharClass.test("z"));
}
},
{
name : "'c-z' not viewed as range",
body : function ()
{
assert.isFalse(reNoCharClass.test("y"));
}
}
/*Non-character class tests End*/
];

if (typeof modifyTests !== "undefined") {
tests = modifyTests(tests);
}

testRunner.runTests(tests, {
verbose : WScript.Arguments[0] != "summary"
});
6 changes: 6 additions & 0 deletions test/Regex/rlexe.xml
Original file line number Diff line number Diff line change
Expand Up @@ -229,4 +229,10 @@
<compile-flags>-args summary -endargs</compile-flags>
</default>
</test>
<test>
<default>
<files>characterclass_with_range.js</files>
<compile-flags>-args summary -endargs</compile-flags>
</default>
</test>
</regress-exe>

0 comments on commit 7e5f990

Please sign in to comment.