Skip to content

Commit

Permalink
Enable RegexOptions.RightToLeft and lookbehinds in compiler / source …
Browse files Browse the repository at this point in the history
…generator (#66280)

* Enable RegexOptions.RightToLeft and lookbehinds in compiler / source generator

For .NET 7 we rewrote RegexCompiler as we were writing the source generator, and in doing so we left out support for RegexOptions.RightToLeft as well as lookbehinds (which are implemented via RightToLeft).  This adds support for both.  I initially started incrementally adding in support for various constructs in lookbehinds, but from a testing perspective it made more sense to just add it all, as then all of the RightToLeft tests are used to validate the constructs that are also in lookbehinds.

* Address PR feedback
  • Loading branch information
stephentoub authored Mar 7, 2022
1 parent 30d66a2 commit 457b1ff
Show file tree
Hide file tree
Showing 10 changed files with 1,313 additions and 502 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
return Diagnostic.Create(DiagnosticDescriptors.InvalidLangVersion, methodSyntax.GetLocation());
}

RegexOptions regexOptions = RegexOptions.Compiled | (options is not null ? (RegexOptions)options : RegexOptions.None);
RegexOptions regexOptions = options is not null ? (RegexOptions)options : RegexOptions.None;

// TODO: This is going to include the culture that's current at the time of compilation.
// What should we do about that? We could:
Expand Down Expand Up @@ -181,7 +181,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M
RegexTree tree;
try
{
tree = RegexParser.Parse(pattern, regexOptions, culture);
tree = RegexParser.Parse(pattern, regexOptions | RegexOptions.Compiled, culture); // make sure Compiled is included to get all optimizations applied to it
}
catch (Exception e)
{
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options, CultureInfo

// Compute any anchor trailing the expression. If there is one, and we can also compute a fixed length
// for the whole expression, we can use that to quickly jump to the right location in the input.
if (!_rightToLeft) // haven't added FindNextStartingPositionMode support for RTL
if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL
{
bool triedToComputeMaxLength = false;

Expand Down Expand Up @@ -297,16 +297,18 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
// For others, we can jump to the relevant location.

case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning:
if (pos > 0)
if (pos != 0)
{
// If we're not currently at the beginning, we'll never be, so fail immediately.
pos = textSpan.Length;
return false;
}
return true;

case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start:
if (pos > start)
if (pos != start)
{
// If we're not currently at the start, we'll never be, so fail immediately.
pos = textSpan.Length;
return false;
}
Expand All @@ -315,27 +317,38 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ:
if (pos < textSpan.Length - 1)
{
// If we're not currently at the end (or a newline just before it), skip ahead
// since nothing until then can possibly match.
pos = textSpan.Length - 1;
}
return true;

case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End:
if (pos < textSpan.Length)
{
// If we're not currently at the end (or a newline just before it), skip ahead
// since nothing until then can possibly match.
pos = textSpan.Length;
}
return true;

case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
if (pos > 0)
if (pos != 0)
{
// If we're not currently at the beginning, skip ahead (or, rather, backwards)
// since nothing until then can possibly match. (We're iterating from the end
// to the beginning in RightToLeft mode.)
pos = 0;
}
return true;

case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
if (pos < start)
if (pos != start)
{
// If we're not currently at the starting position, we'll never be, so fail immediately.
// This is different from beginning, since beginning is the fixed location of 0 whereas
// start is wherever the iteration actually starts from; in left-to-right, that's often
// the same as beginning, but in RightToLeft it rarely is.
pos = 0;
return false;
}
Expand All @@ -344,6 +357,8 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n'))
{
// If we're not currently at the end, we'll never be (we're iterating from end to beginning),
// so fail immediately.
pos = 0;
return false;
}
Expand All @@ -352,6 +367,8 @@ public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos
case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
if (pos < textSpan.Length)
{
// If we're not currently at the end, we'll never be (we're iterating from end to beginning),
// so fail immediately.
pos = 0;
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler
EmitTryMatchAtCurrentPosition();

DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan<char>) });
EmitScan(tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod);
EmitScan(options, tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod);

return new CompiledRegexRunnerFactory(scanMethod);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2558,39 +2558,46 @@ public int ChildCount()
// there's no need to localize).
internal bool SupportsCompilation([NotNullWhen(false)] out string? reason)
{
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
reason = "run-time limits were exceeded";
return false;
}

// NonBacktracking isn't supported, nor RightToLeft. The latter applies to both the top-level
// options as well as when used to specify positive and negative lookbehinds.
if ((Options & RegexOptions.NonBacktracking) != 0)
{
reason = "RegexOptions.NonBacktracking was specified";
reason = "RegexOptions.NonBacktracking isn't supported";
return false;
}

if ((Options & RegexOptions.RightToLeft) != 0)
if (ExceedsMaxDepthAllowedDepth(this, allowedDepth: 40))
{
reason = "RegexOptions.RightToLeft or a positive/negative lookbehind was used";
// For the source generator, deep RegexNode trees can result in emitting C# code that exceeds C# compiler
// limitations, leading to "CS8078: An expression is too long or complex to compile". As such, we place
// an artificial limit on max tree depth in order to mitigate such issues. The allowed depth can be tweaked
// as needed; its exceedingly rare to find expressions with such deep trees. And while RegexCompiler doesn't
// have to deal with C# compiler limitations, we still want to limit max tree depth as we want to limit
// how deep recursion we'll employ as part of code generation.
reason = "the expression may result exceeding run-time or compiler limits";
return false;
}

int childCount = ChildCount();
for (int i = 0; i < childCount; i++)
// Supported.
reason = null;
return true;

static bool ExceedsMaxDepthAllowedDepth(RegexNode node, int allowedDepth)
{
// The node isn't supported if any of its children aren't supported.
if (!Child(i).SupportsCompilation(out reason))
if (allowedDepth <= 0)
{
return false;
return true;
}
}

// Supported.
reason = null;
return true;
int childCount = node.ChildCount();
for (int i = 0; i < childCount; i++)
{
if (ExceedsMaxDepthAllowedDepth(node.Child(i), allowedDepth - 1))
{
return true;
}
}

return false;
}
}

/// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByA
return false;
}

// Track whether we've seen any node with IgnoreCase set.
// Track whether we've seen any nodes with various options set.
results._hasIgnoreCase |= (node.Options & RegexOptions.IgnoreCase) != 0;
results._hasRightToLeft |= (node.Options & RegexOptions.RightToLeft) != 0;

if (isAtomicByAncestor)
{
Expand Down Expand Up @@ -149,6 +150,8 @@ internal sealed class AnalysisResults
internal HashSet<RegexNode>? _mayBacktrack;
/// <summary>Whether any node has <see cref="RegexOptions.IgnoreCase"/> set.</summary>
internal bool _hasIgnoreCase;
/// <summary>Whether any node has <see cref="RegexOptions.RightToLeft"/> set.</summary>
internal bool _hasRightToLeft;

/// <summary>Initializes the instance.</summary>
/// <param name="regexTree">The code being analyzed.</param>
Expand All @@ -158,23 +161,48 @@ internal sealed class AnalysisResults
public RegexTree RegexTree { get; }

/// <summary>Gets whether a node is considered atomic based on its ancestry.</summary>
/// <remarks>
/// If the whole tree couldn't be examined, this returns false. That could lead to additional
/// code being output as nodes that could have been made atomic aren't, but functionally it's
/// the safe choice.
/// </remarks>
public bool IsAtomicByAncestor(RegexNode node) => _isAtomicByAncestor.Contains(node);

/// <summary>Gets whether a node directly or indirectly contains captures.</summary>
/// <remarks>
/// If the whole tree couldn't be examined, this returns true. That could lead to additional
/// code being emitted to deal with captures that can't occur, but functionally it's the
/// safe choice.
/// </remarks>
public bool MayContainCapture(RegexNode node) => !_complete || _containsCapture.Contains(node);

/// <summary>Gets whether a node is or directory or indirectly contains a backtracking construct that isn't hidden by an internal atomic construct.</summary>
/// <remarks>
/// In most code generation situations, we only need to know after we emit the child code whether
/// the child may backtrack, and that we can see with 100% certainty. This method is useful in situations
/// where we need to predict without traversing the child at code generation time whether it may
/// incur backtracking. This method may have (few) false positives, but won't have any false negatives,
/// incur backtracking. This method may have (few) false positives (return true when it could have
/// returned false), but won't have any false negatives (return false when it should have returned true),
/// meaning it might claim a node requires backtracking even if it doesn't, but it will always return
/// true for any node that requires backtracking.
/// true for any node that requires backtracking. In that vein, if the whole tree couldn't be examined,
/// this returns true.
/// </remarks>
public bool MayBacktrack(RegexNode node) => !_complete || (_mayBacktrack?.Contains(node) ?? false);

/// <summary>Gets whether a node might have <see cref="RegexOptions.IgnoreCase"/> set.</summary>
public bool HasIgnoreCase => _complete && _hasIgnoreCase;
/// <remarks>
/// If the whole tree couldn't be examined, this returns true. That could lead to additional
/// code being emitted to support case-insensitivity in expressions that don't actually need
/// it, but functionally it's the safe choice.
/// </remarks>
public bool HasIgnoreCase => !_complete || _hasIgnoreCase;

/// <summary>Gets whether a node might have <see cref="RegexOptions.RightToLeft"/> set.</summary>
/// <remarks>
/// If the whole tree couldn't be examined, this returns true. That could lead to additional
/// code being emitted to support expressions that don't actually contain any RightToLeft
/// nodes, but functionally it's the safe choice.
/// </remarks>
public bool HasRightToLeft => !_complete || _hasRightToLeft;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,36 @@ public static IEnumerable<object[]> Match_MemberData()

if (!RegexHelpers.IsNonBacktracking(engine))
{
// Zero-width negative lookahead assertion: Actual - "abc(?!XXX)\\w+"
// Zero-width negative lookahead assertion
yield return (@"abc(?!XXX)\w+", "abcXXXdef", RegexOptions.None, 0, 9, false, string.Empty);

// Zero-width positive lookbehind assertion: Actual - "(\\w){6}(?<=XXX)def"
// Zero-width positive lookbehind assertion
yield return (@"(\w){6}(?<=XXX)def", "abcXXXdef", RegexOptions.None, 0, 9, true, "abcXXXdef");
yield return (@"(?<=c)def", "123abcdef", RegexOptions.None, 0, 9, true, "def");
yield return (@"(?<=abc)def", "123abcdef", RegexOptions.None, 0, 9, true, "def");
yield return (@"(?<=a\wc)def", "123abcdef", RegexOptions.None, 0, 9, true, "def");
yield return (@"(?<=\ba\wc)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def");
yield return (@"(?<=.\ba\wc\B)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def");
yield return (@"(?<=^123 abc)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def");
yield return (@"(?<=^123 abc)def", "123 abcdef", RegexOptions.Multiline, 0, 10, true, "def");
yield return (@"(?<=123$\nabc)def", "123\nabcdef", RegexOptions.Multiline, 0, 10, true, "def");
yield return (@"123(?<!$) abcdef", "123 abcdef", RegexOptions.None, 0, 10, true, "123 abcdef");
yield return (@"\w{3}(?<=^xyz|^abc)defg", "abcdefg", RegexOptions.None, 0, 7, true, "abcdefg");
yield return (@"(abc)\w(?<=(?(1)d|e))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
yield return (@"(abc)\w(?<!(?(1)e|d))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
yield return (@"(abc)\w(?<=(?(cd)d|e))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
yield return (@"(abc)\w(?<!(?(cd)e|d))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
yield return (@"\w{3}(?<=(\w){6,8})", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "def");
yield return (@"\w{3}(?<=(?:\d\w){4})", "a1b2c3d4e5", RegexOptions.None, 0, 10, true, "d4e");
yield return (@"\w{3}(?<=(\w){6,8}?)", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "def");
yield return (@"\w{3}(?<=(?:\d\w){3,4}?\d)", "a1b2c3d4e5", RegexOptions.None, 0, 10, true, "3d4");
yield return (@"(\w{3})\w*(?<=\1)", "---abcdefababc123", RegexOptions.None, 0, 17, true, "abcdefababc");
yield return (@"(?<=\w{3})\w{4}", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "defg");
yield return (@"(?<=\w{3,8})\w{4}", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "defg");
yield return (@"(?<=\w*)\w{4}", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "abcd");
yield return (@"(?<=\w?)\w{4}", "abcdefghijklmnop", RegexOptions.None, 0, 16, true, "abcd");
yield return (@"(?<=\d?)a{4}", "123aaaaaaaaa", RegexOptions.None, 0, 12, true, "aaaa");
yield return (@"(?<=a{3,5}[ab]*)1234", "aaaaaaa1234", RegexOptions.None, 0, 11, true, "1234");

// Zero-width negative lookbehind assertion: Actual - "(\\w){6}(?<!XXX)def"
yield return (@"(\w){6}(?<!XXX)def", "XXXabcdef", RegexOptions.None, 0, 9, true, "XXXabcdef");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,21 +197,6 @@ partial class C
Assert.Equal("SYSLIB1044", Assert.Single(diagnostics).Id);
}

[Fact]
public async Task Diagnostic_RightToLeft_LimitedSupport()
{
IReadOnlyList<Diagnostic> diagnostics = await RegexGeneratorHelper.RunGenerator(@"
using System.Text.RegularExpressions;
partial class C
{
[RegexGenerator(""ab"", RegexOptions.RightToLeft)]
private static partial Regex RightToLeftNotSupported();
}
");

Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id);
}

[Fact]
public async Task Diagnostic_NonBacktracking_LimitedSupport()
{
Expand All @@ -227,36 +212,6 @@ partial class C
Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id);
}

[Fact]
public async Task Diagnostic_PositiveLookbehind_LimitedSupport()
{
IReadOnlyList<Diagnostic> diagnostics = await RegexGeneratorHelper.RunGenerator(@"
using System.Text.RegularExpressions;
partial class C
{
[RegexGenerator(""(?<=\b20)\d{2}\b"")]
private static partial Regex PositiveLookbehindNotSupported();
}
");

Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id);
}

[Fact]
public async Task Diagnostic_NegativeLookbehind_LimitedSupport()
{
IReadOnlyList<Diagnostic> diagnostics = await RegexGeneratorHelper.RunGenerator(@"
using System.Text.RegularExpressions;
partial class C
{
[RegexGenerator(""(?<!(Saturday|Sunday) )\b\w+ \d{1,2}, \d{4}\b"")]
private static partial Regex NegativeLookbehindNotSupported();
}
");

Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id);
}

[Fact]
public async Task Valid_ClassWithoutNamespace()
{
Expand Down
Loading

0 comments on commit 457b1ff

Please sign in to comment.