Skip to content

Commit

Permalink
Extend regex switch alternation optimization to IgnoreCase (#63756)
Browse files Browse the repository at this point in the history
IgnoreCase now results in producing sets (e.g. an 'a' becomes '[Aa]'), but the source generator's optimization that produces a switch statement for alternations with non-overlapping branches doesn't yet understand such sets. This change augments that logic so that it does.
  • Loading branch information
stephentoub authored Jan 19, 2022
1 parent a25536f commit 777c353
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -916,24 +916,69 @@ void EmitAlternation(RegexNode node)
}
}

// Detect whether every branch begins with one or more unique characters.
const int SetCharsSize = 5; // arbitrary limit (for IgnoreCase, we want this to be at least 3 to handle the vast majority of values)
Span<char> setChars = stackalloc char[SetCharsSize];
if (useSwitchedBranches)
{
// Iterate through every branch, seeing if we can easily find a starting One, Multi, or small Set.
// If we can, extract its starting char (or multiple in the case of a set), validate that all such
// starting characters are unique relative to all the branches.
var seenChars = new HashSet<char>();
for (int i = 0; i < childCount; i++)
for (int i = 0; i < childCount && useSwitchedBranches; i++)
{
if (node.Child(i).FindBranchOneOrMultiStart() is not RegexNode oneOrMulti ||
!seenChars.Add(oneOrMulti.FirstCharOfOneOrMulti()))
// If it's not a One, Multi, or Set, we can't apply this optimization.
// If it's IgnoreCase (and wasn't reduced to a non-IgnoreCase set), also ignore it to keep the logic simple.
if (node.Child(i).FindBranchOneMultiOrSetStart() is not RegexNode oneMultiOrSet ||
(oneMultiOrSet.Options & RegexOptions.IgnoreCase) != 0) // TODO: https://github.com/dotnet/runtime/issues/61048
{
useSwitchedBranches = false;
break;
}

// If it's a One or a Multi, get the first character and add it to the set.
// If it was already in the set, we can't apply this optimization.
if (oneMultiOrSet.Type is RegexNode.One or RegexNode.Multi)
{
if (!seenChars.Add(oneMultiOrSet.FirstCharOfOneOrMulti()))
{
useSwitchedBranches = false;
break;
}
}
else
{
// The branch begins with a set. Make sure it's a set of only a few characters
// and get them. If we can't, we can't apply this optimization.
Debug.Assert(oneMultiOrSet.Type is RegexNode.Set);
int numChars;
if (RegexCharClass.IsNegated(oneMultiOrSet.Str!) ||
(numChars = RegexCharClass.GetSetChars(oneMultiOrSet.Str!, setChars)) == 0)
{
useSwitchedBranches = false;
break;
}

// Check to make sure each of the chars is unique relative to all other branches examined.
foreach (char c in setChars.Slice(0, numChars))
{
if (!seenChars.Add(c))
{
useSwitchedBranches = false;
break;
}
}
}
}
}

if (useSwitchedBranches)
{
// Note: This optimization does not exist with RegexOptions.Compiled. Here we rely on the
// C# compiler to lower the C# switch statement with appropriate optimizations.
// C# compiler to lower the C# switch statement with appropriate optimizations. In some
// cases there are enough branches that the compiler will emit a jump table. In others
// it'll optimize the order of checks in order to minimize the total number in the worst
// case. In any case, we get easier to read and reason about C#.
EmitSwitchedBranches();
}
else
Expand All @@ -950,8 +995,9 @@ void EmitSwitchedBranches()
writer.WriteLine();

// Emit a switch statement on the first char of each branch.
using (EmitBlock(writer, $"switch ({ToLowerIfNeeded(hasTextInfo, options, $"{sliceSpan}[{sliceStaticPos++}]", IsCaseInsensitive(node))})"))
using (EmitBlock(writer, $"switch ({sliceSpan}[{sliceStaticPos++}])"))
{
Span<char> setChars = stackalloc char[SetCharsSize]; // needs to be same size as detection check in caller
int startingSliceStaticPos = sliceStaticPos;

// Emit a case for each branch.
Expand All @@ -960,20 +1006,31 @@ void EmitSwitchedBranches()
sliceStaticPos = startingSliceStaticPos;

RegexNode child = node.Child(i);
Debug.Assert(child.Type is RegexNode.One or RegexNode.Multi or RegexNode.Concatenate, DescribeNode(child, rm.Code));
Debug.Assert(child.Type is not RegexNode.Concatenate || (child.ChildCount() >= 2 && child.Child(0).Type is RegexNode.One or RegexNode.Multi));
Debug.Assert(child.Type is RegexNode.One or RegexNode.Multi or RegexNode.Set or RegexNode.Concatenate, DescribeNode(child, rm.Code));
Debug.Assert(child.Type is not RegexNode.Concatenate || (child.ChildCount() >= 2 && child.Child(0).Type is RegexNode.One or RegexNode.Multi or RegexNode.Set));

RegexNode? childStart = child.FindBranchOneOrMultiStart();
Debug.Assert(childStart is not null, DescribeNode(child, rm.Code));
RegexNode? childStart = child.FindBranchOneMultiOrSetStart();
Debug.Assert(childStart is not null, "Unexpectedly couldn't find the branch starting node.");
Debug.Assert((childStart.Options & RegexOptions.IgnoreCase) == 0, "Expected only to find non-IgnoreCase branch starts");

writer.WriteLine($"case {Literal(childStart.FirstCharOfOneOrMulti())}:");
if (childStart.Type is RegexNode.Set)
{
int numChars = RegexCharClass.GetSetChars(childStart.Str!, setChars);
Debug.Assert(numChars != 0);
writer.WriteLine($"case {string.Join(" or ", setChars.Slice(0, numChars).ToArray().Select(c => Literal(c)))}:");
}
else
{
writer.WriteLine($"case {Literal(childStart.FirstCharOfOneOrMulti())}:");
}
writer.Indent++;

// Emit the code for the branch, without the first character that was already matched in the switch.
switch (child.Type)
{
case RegexNode.Multi:
EmitNode(CloneMultiWithoutFirstChar(child));
writer.WriteLine();
break;

case RegexNode.Concatenate:
Expand All @@ -988,16 +1045,17 @@ void EmitSwitchedBranches()
newConcat.AddChild(child.Child(j));
}
EmitNode(newConcat.Reduce());
writer.WriteLine();
break;

static RegexNode CloneMultiWithoutFirstChar(RegexNode node)
{
Debug.Assert(node.Type is RegexNode.Multi);
Debug.Assert(node.Str!.Length >= 2);
return node.Str!.Length == 2 ?
new RegexNode(RegexNode.One, node.Options, node.Str![1]) :
new RegexNode(RegexNode.Multi, node.Options, node.Str!.Substring(1));
}
// Builds a new node representing the given Multi with its first character dropped.
// When only a single character remains, a One node is produced rather than a Multi;
// otherwise the result is a Multi over the remaining substring. The original node's
// options are carried over to the new node in both cases.
static RegexNode CloneMultiWithoutFirstChar(RegexNode node)
{
    Debug.Assert(node.Type is RegexNode.Multi);

    string text = node.Str!;
    Debug.Assert(text.Length >= 2);

    if (text.Length == 2)
    {
        // Exactly one character is left after removing the first.
        return new RegexNode(RegexNode.One, node.Options, text[1]);
    }

    return new RegexNode(RegexNode.Multi, node.Options, text.Substring(1));
}
}

// This is only ever used for atomic alternations, so we can simply reset the doneLabel
Expand All @@ -1009,7 +1067,6 @@ static RegexNode CloneMultiWithoutFirstChar(RegexNode node)
// Before jumping to the end, we need to zero out sliceStaticPos, so that no
// matter what the value is after the branch, whatever follows the alternate
// will see the same sliceStaticPos.
writer.WriteLine();
TransferSliceStaticPosToPos();
writer.WriteLine($"break;");
writer.WriteLine();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -716,9 +716,8 @@ public static bool TryGetSingleRange(string set, out char lowInclusive, out char
/// </remarks>
public static int GetSetChars(string set, Span<char> chars)
{
// If the set is negated, it's likely to contain a large number of characters,
// so we don't even try. We also get the characters by enumerating the set
// portion, so we validate that it's set up to enable that, e.g. no categories.
// We get the characters by enumerating the set portion, so we validate that it's
// set up to enable that, e.g. no categories.
if (!CanEasilyEnumerateSetContents(set))
{
return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1386,14 +1386,15 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan<char> startingSpan)
/// </summary>
public RegexNode? FindBranchOneOrMultiStart()
{
RegexNode branch = this;

if (branch.Type == Concatenate)
{
branch = branch.Child(0);
}
RegexNode branch = Type == Concatenate ? Child(0) : this;
return branch.Type is One or Multi ? branch : null;
}

return branch.Type == One || branch.Type == Multi ? branch : null;
/// <summary>
/// Finds the node that starts this branch when that node is a One, Multi, or Set.
/// For a Concatenate, the first child is examined; otherwise this node itself is examined.
/// </summary>
/// <returns>The examined node if it is a One, Multi, or Set; otherwise, null.</returns>
public RegexNode? FindBranchOneMultiOrSetStart()
{
    RegexNode start = this;
    if (Type == Concatenate)
    {
        start = Child(0);
    }

    if (start.Type is One or Multi or Set)
    {
        return start;
    }

    return null;
}

/// <summary>Gets the character that begins a One or Multi.</summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,38 @@ public static IEnumerable<object[]> Match_MemberData()
}

// Alternation construct
foreach (string input in new[] { "abc", "def" })
{
string upper = input.ToUpperInvariant();

// Two branches
yield return (@"abc|def", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"abc|def", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"abc|def", upper, RegexOptions.None, 0, input.Length, false, "");

// Three branches
yield return (@"abc|agh|def", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"abc|agh|def", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"abc|agh|def", upper, RegexOptions.None, 0, input.Length, false, "");

// Four branches
yield return (@"abc|agh|def|aij", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"abc|agh|def|aij", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"abc|agh|def|aij", upper, RegexOptions.None, 0, input.Length, false, "");

// Four branches (containing various other constructs)
if (!RegexHelpers.IsNonBacktracking(engine))
{
yield return (@"abc|(agh)|(?=def)def|(?:(?(aij)aij|(?!)))", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"abc|(agh)|(?=def)def|(?:(?(aij)aij|(?!)))", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"abc|(agh)|(?=def)def|(?:(?(aij)aij|(?!)))", upper, RegexOptions.None, 0, input.Length, false, "");
}

// Sets in various positions in each branch
yield return (@"a\wc|\wgh|de\w", input, RegexOptions.None, 0, input.Length, true, input);
yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.None, 0, input.Length, false, "");
}
yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest");
if (!RegexHelpers.IsNonBacktracking(engine))
{
Expand All @@ -446,7 +478,6 @@ public static IEnumerable<object[]> Match_MemberData()
yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.None, 0, 4, false, string.Empty);
yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd");
}
yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest");

// No Negation
yield return ("[abcd-[abcd]]+", "abcxyzABCXYZ`!@#$%^&*()_-+= \t\n", RegexOptions.None, 0, 30, false, string.Empty);
Expand Down

0 comments on commit 777c353

Please sign in to comment.