diff --git a/lib/Parser/DebugWriter.cpp b/lib/Parser/DebugWriter.cpp
index 7ee268e89bd..4d1caf03958 100644
--- a/lib/Parser/DebugWriter.cpp
+++ b/lib/Parser/DebugWriter.cpp
@@ -72,6 +72,8 @@ namespace UnifiedRegex
CheckForNewline();
if (c > 0xff)
Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]);
+ else if (c == '-')
+ Output::Print(_u("\\x2d"));
else if (c < ' ' || c > '~')
Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]);
else
diff --git a/lib/Parser/RegexParser.cpp b/lib/Parser/RegexParser.cpp
index 619a4f6e89e..692527de0c3 100644
--- a/lib/Parser/RegexParser.cpp
+++ b/lib/Parser/RegexParser.cpp
@@ -1931,6 +1931,7 @@ namespace UnifiedRegex
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
bool previousSurrogatePart = false;
+
while(nextChar != ']')
{
current = next;
@@ -2034,7 +2035,7 @@ namespace UnifiedRegex
lastCodepoint = INVALID_CODEPOINT;
}
- // If we the next character is the end of range ']', then we can't have a surrogate pair.
+ // If the next character is the end of range ']', then we can't have a surrogate pair.
// The current character is the range end, if we don't already have a candidate.
else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
{
@@ -2124,6 +2125,10 @@ namespace UnifiedRegex
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
EncodedChar nextChar = ECLookahead();
bool previousWasASurrogate = false;
+ bool currIsACharSet = false;
+ bool prevWasACharSet = false;
+ bool prevprevWasACharSet = false;
+
while(nextChar != ']')
{
codepoint_t codePointToSet = INVALID_CODEPOINT;
@@ -2147,30 +2152,30 @@ namespace UnifiedRegex
else if (nextChar == '\\')
{
Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
+ codePointToSet = pendingCodePoint;
if (returnedNode->tag == Node::MatchSet)
{
- codePointToSet = pendingCodePoint;
- pendingCodePoint = INVALID_CODEPOINT;
+ pendingCodePoint = nextChar;
if (pendingRangeStart != INVALID_CODEPOINT)
{
codePointSet.Set(ctAllocator, '-');
}
pendingRangeStart = INVALID_CODEPOINT;
codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
+ currIsACharSet = true;
}
else
{
// Just a character
- codePointToSet = pendingCodePoint;
pendingCodePoint = deferredCharNode.cs[0];
}
}
else if (nextChar == '-')
{
- if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
+ if ((!prevWasACharSet && (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT)) || ECLookahead(1) == ']')
{
- // - is just a char, or end of a range.
+ // - is just a char, or end of a range. If the previous char of the RegExp was a charset we want to treat it as the beginning of a range.
codePointToSet = pendingCodePoint;
pendingCodePoint = '-';
ECConsume();
@@ -2192,14 +2197,22 @@ namespace UnifiedRegex
{
if (pendingRangeStart != INVALID_CODEPOINT)
{
- if (pendingRangeStart > pendingCodePoint)
+ if (pendingRangeStart > pendingCodePoint && !prevprevWasACharSet)
{
//We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
//This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
Assert(!unicodeFlagPresent);
Fail(JSERR_RegExpBadRange);
}
- codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
+ if (prevprevWasACharSet)
+ {
+ codePointSet.Set(ctAllocator, '-');
+ codePointSet.Set(ctAllocator, pendingCodePoint);
+ }
+ else
+ {
+ codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
+ }
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
}
else
@@ -2209,6 +2222,9 @@ namespace UnifiedRegex
}
nextChar = ECLookahead();
+ prevprevWasACharSet = prevWasACharSet;
+ prevWasACharSet = currIsACharSet;
+ currIsACharSet = false;
}
if (pendingCodePoint != INVALID_CODEPOINT)
diff --git a/test/Regex/characterclass_with_range.js b/test/Regex/characterclass_with_range.js
new file mode 100644
index 00000000000..c9b343b9e5b
--- /dev/null
+++ b/test/Regex/characterclass_with_range.js
@@ -0,0 +1,163 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
+
+let re = /^[\s-a-z]$/;
+let reIgnoreCase = /^[\s-a-z]$/i;
+let reUnicode = /^[\s-z]$/u;
+let reNoCharClass = /^[a-c-z]$/;
+
+var tests = [
+ /*No Flag RegExp Tests begin*/
+ {
+ name : "Ensure 'a-z' not counted as range",
+ body : function ()
+ {
+ assert.isFalse(re.test("b"));
+ }
+ },
+ {
+ name : "Ensure 'a' included in set",
+ body : function ()
+ {
+ assert.isTrue(re.test("a"));
+ }
+ },
+ {
+ name : "Ensure ' ' included in set",
+ body : function ()
+ {
+ assert.isTrue(re.test(" "));
+ }
+ },
+ {
+ name : "Ensure 'z' included in set",
+ body : function ()
+ {
+ assert.isTrue(re.test("z"));
+ }
+ },
+ {
+ name : "Ensure '\t' included in set",
+ body : function ()
+ {
+ assert.isTrue(re.test("\t"));
+ }
+ },
+ {
+ name : "Ensure 'a-z' not counted as range",
+ body : function ()
+ {
+ assert.isFalse(re.test("q"));
+ }
+ },
+ {
+ name : "Ensure '\' not counted in set",
+ body : function ()
+ {
+ assert.isFalse(re.test("\\"));
+ }
+ },
+ /*No Flag RegExp Tests End*/
+ /*IgnoreCase Flag RegExp Tests Begin*/
+ {
+ name : "Ensure 'O' not included in set",
+ body : function ()
+ {
+ assert.isFalse(reIgnoreCase.test("O"));
+ }
+ },
+ {
+ name : "Ensure 'A' included in set",
+ body : function ()
+ {
+ assert.isTrue(reIgnoreCase.test("A"));
+ }
+ },
+ {
+ name : "Ensure ' ' included in set",
+ body : function ()
+ {
+ assert.isTrue(reIgnoreCase.test(" "));
+ }
+ },
+ {
+ name : "Ensure 'z' included in set",
+ body : function ()
+ {
+ assert.isTrue(reIgnoreCase.test("z"));
+ }
+ },
+ {
+ name : "Ensure '\t' included in set",
+ body : function ()
+ {
+ assert.isTrue(reIgnoreCase.test("\t"));
+ }
+ },
+ /*IgnoreCase Flag RegExp Tests End*/
+ /*Unicode Flag RegExp Tests Begin*/
+ {
+ name : "'-' included in set since \s-z treated as union of three types, not range",
+ body : function ()
+ {
+ assert.isTrue(reUnicode.test("-"));
+ }
+ },
+ {
+ name : "' ' in set from \s character set",
+ body : function ()
+ {
+ assert.isTrue(reUnicode.test(" "));
+ }
+ },
+ {
+ name : "b not included in '\s-z'",
+ body : function ()
+ {
+ assert.isFalse(reUnicode.test("b"));
+ }
+ },
+ /*Unicode Flag RegExp Tests End*/
+ /*Non-character class tests Begin*/
+ {
+ name : "First range is used",
+ body : function ()
+ {
+ assert.isTrue(reNoCharClass.test("b"));
+ }
+ },
+ {
+ name : "'-' included in set from 2nd dash",
+ body : function ()
+ {
+ assert.isTrue(reNoCharClass.test("-"));
+ }
+ },
+ {
+ name : "z included in set",
+ body : function ()
+ {
+ assert.isTrue(reNoCharClass.test("z"));
+ }
+ },
+ {
+ name : "'c-z' not viewed as range",
+ body : function ()
+ {
+ assert.isFalse(reNoCharClass.test("y"));
+ }
+ }
+ /*Non-character class tests End*/
+];
+
+if (typeof modifyTests !== "undefined") {
+ tests = modifyTests(tests);
+}
+
+testRunner.runTests(tests, {
+ verbose : WScript.Arguments[0] != "summary"
+});
diff --git a/test/Regex/rlexe.xml b/test/Regex/rlexe.xml
index 6d957a49b5a..775d059f564 100644
--- a/test/Regex/rlexe.xml
+++ b/test/Regex/rlexe.xml
@@ -229,4 +229,10 @@
-args summary -endargs
+
+
+ characterclass_with_range.js
+ -args summary -endargs
+
+