Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Extend regex switch alternation optimization to IgnoreCase #63756

Merged
merged 1 commit into from
Jan 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -855,24 +855,69 @@ void EmitAlternation(RegexNode node)
}
}

// Detect whether every branch begins with one or more unique characters.
const int SetCharsSize = 5; // arbitrary limit (for IgnoreCase, we want this to be at least 3 to handle the vast majority of values)
Span<char> setChars = stackalloc char[SetCharsSize];
if (useSwitchedBranches)
{
// Iterate through every branch, seeing if we can easily find a starting One, Multi, or small Set.
// If we can, extract its starting char (or multiple in the case of a set), validate that all such
// starting characters are unique relative to all the branches.
var seenChars = new HashSet<char>();
for (int i = 0; i < childCount; i++)
for (int i = 0; i < childCount && useSwitchedBranches; i++)
{
if (node.Child(i).FindBranchOneOrMultiStart() is not RegexNode oneOrMulti ||
!seenChars.Add(oneOrMulti.FirstCharOfOneOrMulti()))
// If it's not a One, Multi, or Set, we can't apply this optimization.
// If it's IgnoreCase (and wasn't reduced to a non-IgnoreCase set), also ignore it to keep the logic simple.
if (node.Child(i).FindBranchOneMultiOrSetStart() is not RegexNode oneMultiOrSet ||
(oneMultiOrSet.Options & RegexOptions.IgnoreCase) != 0) // TODO: https://github.com/dotnet/runtime/issues/61048
{
useSwitchedBranches = false;
break;
}

// If it's a One or a Multi, get the first character and add it to the set.
// If it was already in the set, we can't apply this optimization.
if (oneMultiOrSet.Type is RegexNode.One or RegexNode.Multi)
{
if (!seenChars.Add(oneMultiOrSet.FirstCharOfOneOrMulti()))
{
useSwitchedBranches = false;
break;
}
}
else
{
// The branch begins with a set. Make sure it's a set of only a few characters
// and get them. If we can't, we can't apply this optimization.
Debug.Assert(oneMultiOrSet.Type is RegexNode.Set);
int numChars;
if (RegexCharClass.IsNegated(oneMultiOrSet.Str!) ||
(numChars = RegexCharClass.GetSetChars(oneMultiOrSet.Str!, setChars)) == 0)
{
useSwitchedBranches = false;
break;
}

// Check to make sure each of the chars is unique relative to all other branches examined.
foreach (char c in setChars.Slice(0, numChars))
{
if (!seenChars.Add(c))
{
useSwitchedBranches = false;
break;
}
}
}
}
}

if (useSwitchedBranches)
{
// Note: This optimization does not exist with RegexOptions.Compiled. Here we rely on the
// C# compiler to lower the C# switch statement with appropriate optimizations.
// C# compiler to lower the C# switch statement with appropriate optimizations. In some
// cases there are enough branches that the compiler will emit a jump table. In others
// it'll optimize the order of checks in order to minimize the total number in the worst
// case. In any case, we get easier to read and reason about C#.
EmitSwitchedBranches();
}
else
Expand All @@ -889,8 +934,9 @@ void EmitSwitchedBranches()
writer.WriteLine();

// Emit a switch statement on the first char of each branch.
using (EmitBlock(writer, $"switch ({ToLowerIfNeeded(hasTextInfo, options, $"{sliceSpan}[{sliceStaticPos++}]", IsCaseInsensitive(node))})"))
using (EmitBlock(writer, $"switch ({sliceSpan}[{sliceStaticPos++}])"))
{
Span<char> setChars = stackalloc char[SetCharsSize]; // needs to be same size as detection check in caller
int startingSliceStaticPos = sliceStaticPos;

// Emit a case for each branch.
Expand All @@ -899,20 +945,31 @@ void EmitSwitchedBranches()
sliceStaticPos = startingSliceStaticPos;

RegexNode child = node.Child(i);
Debug.Assert(child.Type is RegexNode.One or RegexNode.Multi or RegexNode.Concatenate, DescribeNode(child, rm.Code));
Debug.Assert(child.Type is not RegexNode.Concatenate || (child.ChildCount() >= 2 && child.Child(0).Type is RegexNode.One or RegexNode.Multi));
Debug.Assert(child.Type is RegexNode.One or RegexNode.Multi or RegexNode.Set or RegexNode.Concatenate, DescribeNode(child, rm.Code));
Debug.Assert(child.Type is not RegexNode.Concatenate || (child.ChildCount() >= 2 && child.Child(0).Type is RegexNode.One or RegexNode.Multi or RegexNode.Set));

RegexNode? childStart = child.FindBranchOneOrMultiStart();
Debug.Assert(childStart is not null, DescribeNode(child, rm.Code));
RegexNode? childStart = child.FindBranchOneMultiOrSetStart();
Debug.Assert(childStart is not null, "Unexpectedly couldn't find the branch starting node.");
Debug.Assert((childStart.Options & RegexOptions.IgnoreCase) == 0, "Expected only to find non-IgnoreCase branch starts");

writer.WriteLine($"case {Literal(childStart.FirstCharOfOneOrMulti())}:");
if (childStart.Type is RegexNode.Set)
{
int numChars = RegexCharClass.GetSetChars(childStart.Str!, setChars);
Debug.Assert(numChars != 0);
writer.WriteLine($"case {string.Join(" or ", setChars.Slice(0, numChars).ToArray().Select(c => Literal(c)))}:");
danmoseley marked this conversation as resolved.
Show resolved Hide resolved
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
}
else
{
writer.WriteLine($"case {Literal(childStart.FirstCharOfOneOrMulti())}:");
}
writer.Indent++;

// Emit the code for the branch, without the first character that was already matched in the switch.
switch (child.Type)
{
case RegexNode.Multi:
EmitNode(CloneMultiWithoutFirstChar(child));
writer.WriteLine();
break;

case RegexNode.Concatenate:
Expand All @@ -927,16 +984,17 @@ void EmitSwitchedBranches()
newConcat.AddChild(child.Child(j));
}
EmitNode(newConcat.Reduce());
writer.WriteLine();
break;

// Builds a new node representing the given Multi with its first character removed.
// When only a single character remains, the result is a One holding that character;
// otherwise it is a Multi holding the remaining substring. The source node's options
// are carried over to the new node.
static RegexNode CloneMultiWithoutFirstChar(RegexNode node)
{
    Debug.Assert(node.Type is RegexNode.Multi);

    string text = node.Str!;
    Debug.Assert(text.Length >= 2);

    if (text.Length == 2)
    {
        // Exactly one character is left after dropping the first.
        return new RegexNode(RegexNode.One, node.Options, text[1]);
    }

    return new RegexNode(RegexNode.Multi, node.Options, text.Substring(1));
}
// Builds a new node representing the given Multi with its first character removed.
// When only a single character remains, the result is a One holding that character;
// otherwise it is a Multi holding the remaining substring. The source node's options
// are carried over to the new node.
static RegexNode CloneMultiWithoutFirstChar(RegexNode node)
{
    Debug.Assert(node.Type is RegexNode.Multi);

    string text = node.Str!;
    Debug.Assert(text.Length >= 2);

    if (text.Length == 2)
    {
        // Exactly one character is left after dropping the first.
        return new RegexNode(RegexNode.One, node.Options, text[1]);
    }

    return new RegexNode(RegexNode.Multi, node.Options, text.Substring(1));
}
}

// This is only ever used for atomic alternations, so we can simply reset the doneLabel
Expand All @@ -948,7 +1006,6 @@ static RegexNode CloneMultiWithoutFirstChar(RegexNode node)
// Before jumping to the end, we need to zero out sliceStaticPos, so that no
// matter what the value is after the branch, whatever follows the alternate
// will see the same sliceStaticPos.
writer.WriteLine();
TransferSliceStaticPosToPos();
writer.WriteLine($"break;");
writer.WriteLine();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -716,9 +716,8 @@ public static bool TryGetSingleRange(string set, out char lowInclusive, out char
/// </remarks>
public static int GetSetChars(string set, Span<char> chars)
{
// If the set is negated, it's likely to contain a large number of characters,
// so we don't even try. We also get the characters by enumerating the set
// portion, so we validate that it's set up to enable that, e.g. no categories.
// We get the characters by enumerating the set portion, so we validate that it's
// set up to enable that, e.g. no categories.
if (!CanEasilyEnumerateSetContents(set))
{
return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1360,14 +1360,15 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan<char> startingSpan)
/// </summary>
public RegexNode? FindBranchOneOrMultiStart()
{
RegexNode branch = this;

if (branch.Type == Concatenate)
{
branch = branch.Child(0);
}
RegexNode branch = Type == Concatenate ? Child(0) : this;
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
return branch.Type is One or Multi ? branch : null;
}

return branch.Type == One || branch.Type == Multi ? branch : null;
/// <summary>
/// Same as <see cref="FindBranchOneOrMultiStart"/> but also for Sets: returns the node that
/// starts this branch when that node is a One, Multi, or Set.
/// </summary>
/// <returns>
/// The starting node (the first child when this node is a Concatenate, otherwise this node),
/// or <see langword="null"/> if that node is not a One, Multi, or Set.
/// </returns>
public RegexNode? FindBranchOneMultiOrSetStart()
{
    // For a concatenation, the branch begins with its first child.
    RegexNode start = this;
    if (Type == Concatenate)
    {
        start = Child(0);
    }

    if (start.Type is One or Multi or Set)
    {
        return start;
    }

    return null;
}

/// <summary>Gets the character that begins a One or Multi.</summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,38 @@ public static IEnumerable<object[]> Match_MemberData()
}

// Alternation construct
foreach (string input in new[] { "abc", "def" })
{
string upper = input.ToUpperInvariant();

// Two branches
yield return (@"abc|def", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"abc|def", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"abc|def", upper, RegexOptions.None, 0, input.Length, false, "");

// Three branches
yield return (@"abc|agh|def", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"abc|agh|def", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"abc|agh|def", upper, RegexOptions.None, 0, input.Length, false, "");

// Four branches
yield return (@"abc|agh|def|aij", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"abc|agh|def|aij", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"abc|agh|def|aij", upper, RegexOptions.None, 0, input.Length, false, "");

// Four branches (containing various other constructs)
if (!RegexHelpers.IsNonBacktracking(engine))
{
yield return (@"abc|(agh)|(?=def)def|(?:(?(aij)aij|(?!)))", input, RegexOptions.None, 0, input.Length, true, input);
danmoseley marked this conversation as resolved.
Show resolved Hide resolved
yield return (@"abc|(agh)|(?=def)def|(?:(?(aij)aij|(?!)))", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"abc|(agh)|(?=def)def|(?:(?(aij)aij|(?!)))", upper, RegexOptions.None, 0, input.Length, false, "");
}

// Sets in various positions in each branch
yield return (@"a\wc|\wgh|de\w", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.None, 0, input.Length, false, "");
}
yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest");
if (!RegexHelpers.IsNonBacktracking(engine))
{
Expand All @@ -373,7 +405,6 @@ public static IEnumerable<object[]> Match_MemberData()
yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.None, 0, 4, false, string.Empty);
yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd");
}
yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest");

// No Negation
yield return ("[abcd-[abcd]]+", "abcxyzABCXYZ`!@#$%^&*()_-+= \t\n", RegexOptions.None, 0, 30, false, string.Empty);
Expand Down