diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index f4b82bcc2562f..cf70fe9a1a5c6 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -215,17 +215,22 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" {{"); // Main implementation methods - writer.WriteLine($" protected override void InitTrackCount() => base.runtrackcount = {rm.Code.TrackCount};"); - writer.WriteLine(); - writer.WriteLine(" // Description:"); DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", analysis); // skip implicit root capture writer.WriteLine(); - writer.WriteLine($" protected override bool FindFirstChar()"); + writer.WriteLine($" protected override void Scan(global::System.ReadOnlySpan text)"); + writer.WriteLine($" {{"); + writer.Indent += 4; + EmitScan(writer, rm, id); + writer.Indent -= 4; + writer.WriteLine($" }}"); + writer.WriteLine(); + + writer.WriteLine($" private bool TryFindNextPossibleStartingPosition(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; - RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id); + RequiredHelperFunctions requiredHelpers = EmitTryFindNextPossibleStartingPosition(writer, rm, id); writer.Indent -= 4; writer.WriteLine($" }}"); writer.WriteLine(); @@ -233,10 +238,10 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri { writer.WriteLine($" [global::System.Runtime.CompilerServices.SkipLocalsInit]"); } - writer.WriteLine($" protected override void Go()"); + writer.WriteLine($" private bool TryMatchAtCurrentPosition(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; - requiredHelpers |= EmitGo(writer, rm, id, analysis); + requiredHelpers |= 
EmitTryMatchAtCurrentPosition(writer, rm, id, analysis); writer.Indent -= 4; writer.WriteLine($" }}"); @@ -271,6 +276,41 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" }}"); } + if ((requiredHelpers & RequiredHelperFunctions.IsBoundary) != 0) + { + writer.WriteLine(); + writer.WriteLine($" /// Determines whether the character at the specified index is a boundary."); + writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); + writer.WriteLine($" private static bool IsBoundary(global::System.ReadOnlySpan inputSpan, int index)"); + writer.WriteLine($" {{"); + writer.WriteLine($" int indexM1 = index - 1;"); + writer.WriteLine($" return ((uint)indexM1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexM1])) !="); + writer.WriteLine($" ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));"); + writer.WriteLine(); + writer.WriteLine($" static bool IsBoundaryWordChar(char ch) =>"); + writer.WriteLine($" IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');"); + writer.WriteLine($" }}"); + } + + if ((requiredHelpers & RequiredHelperFunctions.IsECMABoundary) != 0) + { + writer.WriteLine(); + writer.WriteLine($" /// Determines whether the character at the specified index is a boundary."); + writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); + writer.WriteLine($" private static bool IsECMABoundary(global::System.ReadOnlySpan inputSpan, int index)"); + writer.WriteLine($" {{"); + writer.WriteLine($" int indexM1 = index - 1;"); + writer.WriteLine($" return ((uint)indexM1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexM1])) !="); + writer.WriteLine($" ((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));"); + writer.WriteLine(); + writer.WriteLine($" static bool 
IsECMAWordChar(char ch) =>"); + writer.WriteLine($" ((((uint)ch - 'A') & ~0x20) < 26) || // ASCII letter"); + writer.WriteLine($" (((uint)ch - '0') < 10) || // digit"); + writer.WriteLine($" ch == '_' || // underscore"); + writer.WriteLine($" ch == '\\u0130'; // latin capital letter I with dot above"); + writer.WriteLine($" }}"); + } + writer.WriteLine($" }}"); writer.WriteLine($" }}"); writer.WriteLine("}"); @@ -299,8 +339,30 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) } } - /// Emits the body of the FindFirstChar override. - private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id) + /// Emits the body of the Scan method override. + private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) + { + using (EmitBlock(writer, "while (TryFindNextPossibleStartingPosition(text))")) + { + if (rm.MatchTimeout != Timeout.Infinite) + { + writer.WriteLine("base.CheckTimeout();"); + writer.WriteLine(); + } + + writer.WriteLine("// If we find a match on the current position, or we have reached the end of the input, we are done."); + using (EmitBlock(writer, "if (TryMatchAtCurrentPosition(text) || base.runtextpos == text.Length)")) + { + writer.WriteLine("return;"); + } + writer.WriteLine(); + + writer.WriteLine("base.runtextpos++;"); + } + } + + /// Emits the body of the TryFindNextPossibleStartingPosition. 
+ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, string id) { RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; @@ -347,7 +409,6 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix); break; @@ -356,13 +417,11 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitFixedSet(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive: Debug.Assert(code.FindOptimizations.LiteralAfterLoop is not null); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitLiteralAfterAtomicLoop(); break; @@ -392,7 +451,7 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ // searching is required; otherwise, false. bool EmitAnchors() { - // Anchors that fully implement FindFirstChar, with a check that leads to immediate success or failure determination. + // Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. 
switch (code.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: @@ -463,7 +522,6 @@ bool EmitAnchors() // the other anchors, which all skip all subsequent processing if found, with BOL we just use it // to boost our position to the next line, and then continue normally with any searches. writer.WriteLine("// Beginning-of-line anchor"); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); additionalDeclarations.Add("int beginning = base.runtextbeg;"); using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')")) { @@ -710,8 +768,8 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or } } - /// Emits the body of the Go override. - private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMethod rm, string id, AnalysisResults analysis) + /// Emits the body of the TryMatchAtCurrentPosition. + private static RequiredHelperFunctions EmitTryMatchAtCurrentPosition(IndentedTextWriter writer, RegexMethod rm, string id, AnalysisResults analysis) { // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled // version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via @@ -730,7 +788,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // label that code should jump back to when backtracking. That way, a subsequent EmitXx function doesn't need to know exactly // where to jump: it simply always jumps to "doneLabel" on match failure, and "doneLabel" is always configured to point to // the right location. 
In an expression without backtracking, or before any backtracking constructs have been encountered, - // "doneLabel" is simply the final return location from the Go method that will undo any captures and exit, signaling to + // "doneLabel" is simply the final return location from the TryMatchAtCurrentPosition method that will undo any captures and exit, signaling to // the calling scan loop that nothing was matched. // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated @@ -752,17 +810,18 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); node = node.Child(0); - // In some limited cases, FindFirstChar will only return true if it successfully matched the whole expression. - // We can special case these to do essentially nothing in Go other than emit the capture. + // In some limited cases, TryFindNextPossibleStartingPosition will only return true if it successfully matched the whole expression. + // We can special case these to do essentially nothing in TryMatchAtCurrentPosition other than emit the capture. switch (node.Kind) { case RegexNodeKind.Multi or RegexNodeKind.Notone or RegexNodeKind.One or RegexNodeKind.Set when !IsCaseInsensitive(node): // This is the case for single and multiple characters, though the whole thing is only guaranteed - // to have been validated in FindFirstChar when doing case-sensitive comparison. + // to have been validated in TryFindNextPossibleStartingPosition when doing case-sensitive comparison. writer.WriteLine($"int start = base.runtextpos;"); writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? 
node.Str!.Length : 1)};"); writer.WriteLine("base.Capture(0, start, end);"); writer.WriteLine("base.runtextpos = end;"); + writer.WriteLine("return true;"); return requiredHelpers; case RegexNodeKind.Empty: @@ -770,6 +829,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // source generator and seeing what happens as you add more to expressions. When approaching // it from a learning perspective, this is very common, as it's the empty string you start with. writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);"); + writer.WriteLine("return true;"); return requiredHelpers; } @@ -781,7 +841,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // Declare some locals. string sliceSpan = "slice"; - writer.WriteLine("global::System.ReadOnlySpan inputSpan = base.runtext;"); writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;"); writer.WriteLine($"int original_pos = pos;"); bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); @@ -825,7 +884,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe } writer.WriteLine("base.runtextpos = pos;"); writer.WriteLine("base.Capture(0, original_pos, pos);"); - writer.WriteLine("return;"); + writer.WriteLine("return true;"); writer.WriteLine(); // We only get here in the code if the whole expression fails to match and jumps to @@ -836,6 +895,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe { EmitUncaptureUntil("0"); } + writer.WriteLine("return false;"); // We're done with the match. @@ -845,8 +905,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // And emit any required helpers. 
if (additionalLocalFunctions.Count != 0) { - writer.WriteLine("return;"); // not strictly necessary, just for readability - foreach (KeyValuePair localFunctions in additionalLocalFunctions.OrderBy(k => k.Key)) { writer.WriteLine(); @@ -2138,13 +2196,22 @@ void EmitBoundary(RegexNode node) string call = node.Kind switch { - RegexNodeKind.Boundary => "!base.IsBoundary", - RegexNodeKind.NonBoundary => "base.IsBoundary", - RegexNodeKind.ECMABoundary => "!base.IsECMABoundary", - _ => "base.IsECMABoundary", + RegexNodeKind.Boundary => "!IsBoundary", + RegexNodeKind.NonBoundary => "IsBoundary", + RegexNodeKind.ECMABoundary => "!IsECMABoundary", + _ => "IsECMABoundary", }; - using (EmitBlock(writer, $"if ({call}(pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))")) + RequiredHelperFunctions boundaryFunctionRequired = node.Kind switch + { + RegexNodeKind.Boundary or + RegexNodeKind.NonBoundary => RequiredHelperFunctions.IsBoundary | RequiredHelperFunctions.IsWordChar, // IsBoundary internally uses IsWordChar + _ => RequiredHelperFunctions.IsECMABoundary + }; + + requiredHelpers |= boundaryFunctionRequired; + + using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}))")) { writer.WriteLine($"goto {doneLabel};"); } @@ -3827,9 +3894,13 @@ public void Dispose() private enum RequiredHelperFunctions { /// No additional functions are required. - None, + None = 0b0, /// The IsWordChar helper is required. - IsWordChar + IsWordChar = 0b1, + /// The IsBoundary helper is required. + IsBoundary = 0b10, + /// The IsECMABoundary helper is required. 
+ IsECMABoundary = 0b100 } } } diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 251d595081072..dd6956d11dc7f 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -171,6 +171,10 @@ public static void CompileToAssembly(System.Text.RegularExpressions.RegexCompila public string GroupNameFromNumber(int i) { throw null; } public int GroupNumberFromName(string name) { throw null; } protected void InitializeReferences() { } + public bool IsMatch(System.ReadOnlySpan input) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; } public bool IsMatch(string input) { throw null; } public bool IsMatch(string input, int startat) { throw null; } public static bool IsMatch(string input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; } @@ -330,9 +334,9 @@ protected void DoubleCrawl() { } protected void DoubleStack() { } protected void DoubleTrack() { } protected void EnsureStorage() { } - protected 
abstract bool FindFirstChar(); - protected abstract void Go(); - protected abstract void InitTrackCount(); + protected virtual bool FindFirstChar() { throw null; } + protected virtual void Go() { throw null; } + protected virtual void InitTrackCount() { throw null; } protected bool IsBoundary(int index, int startpos, int endpos) { throw null; } protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; } protected bool IsMatched(int cap) { throw null; } @@ -341,6 +345,7 @@ protected void EnsureStorage() { } protected int Popcrawl() { throw null; } protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) { throw null; } protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; } + protected internal virtual void Scan(System.ReadOnlySpan text) { throw null; } protected void TransferCapture(int capnum, int uncapnum, int start, int end) { } protected void Uncapture() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx index 154f34512ed1e..9e905df551a93 100644 --- a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx +++ b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx @@ -1,5 +1,64 @@  + @@ -254,4 +313,7 @@ balancing group (?<name1-name2>subexpression) or (?'name1-name2' subexpression) - + + Searching an input span using a pre-compiled Regex assembly is not supported. Please use the string overloads or use a newer Regex implementation. 
+ + \ No newline at end of file diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs index 81683e09bec1f..2eee81fadc66f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs @@ -9,7 +9,7 @@ namespace System.Text.RegularExpressions /// public class Capture { - internal Capture(string text, int index, int length) + internal Capture(string? text, int index, int length) { Text = text; Index = index; @@ -19,27 +19,38 @@ internal Capture(string text, int index, int length) /// Returns the position in the original string where the first character of captured substring was found. public int Index { get; private protected set; } + /// + /// This method should only be called when the text for matching was sliced with a different beginning, so the resulting index of + /// the match is not from the start of the text, but instead the start of the slice. This method will add back those extra indices + /// to account for the original text beginning. + /// + /// The original text's beginning offset. + internal void AddBeginningToIndex(int beginning) + { + Index += beginning; + } + /// Returns the length of the captured substring. public int Length { get; private protected set; } /// The original string - internal string Text { get; set; } + internal string? Text { get; set; } /// Gets the captured substring from the input string. /// The substring that is captured by the match. - public string Value => Text.Substring(Index, Length); + public string Value => Text is string text ? text.Substring(Index, Length) : string.Empty; /// Gets the captured span from the input string. /// The span that is captured by the match. 
- public ReadOnlySpan ValueSpan => Text.AsSpan(Index, Length); + public ReadOnlySpan ValueSpan => Text is string text ? text.AsSpan(Index, Length) : ReadOnlySpan.Empty; /// Returns the substring that was matched. public override string ToString() => Value; /// The substring to the left of the capture - internal ReadOnlyMemory GetLeftSubstring() => Text.AsMemory(0, Index); + internal ReadOnlyMemory GetLeftSubstring() => Text is string text ? text.AsMemory(0, Index) : ReadOnlyMemory.Empty; /// The substring to the right of the capture - internal ReadOnlyMemory GetRightSubstring() => Text.AsMemory(Index + Length, Text.Length - Index - Length); + internal ReadOnlyMemory GetRightSubstring() => Text is string text ? text.AsMemory(Index + Length, Text.Length - Index - Length) : ReadOnlyMemory.Empty; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index c9e6419fed87c..459ca6ed590a7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -5,20 +5,16 @@ namespace System.Text.RegularExpressions { internal sealed class CompiledRegexRunner : RegexRunner { - private readonly Action _goMethod; - private readonly Func _findFirstCharMethod; + private readonly ScanDelegate _scanMethod; - public CompiledRegexRunner(Action go, Func findFirstChar, int trackCount) + internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan text); + + public CompiledRegexRunner(ScanDelegate scan) { - _goMethod = go; - _findFirstCharMethod = findFirstChar; - runtrackcount = trackCount; + _scanMethod = scan; } - protected override void Go() => _goMethod(this); - - protected override bool FindFirstChar() => _findFirstCharMethod(this); - - protected 
override void InitTrackCount() { } + protected internal override void Scan(ReadOnlySpan text) + => _scanMethod(this, text); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs index 958d5cf3dc16f..4a9147e4d363b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs @@ -7,25 +7,18 @@ namespace System.Text.RegularExpressions { internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory { - private readonly DynamicMethod _goMethod; - private readonly DynamicMethod _findFirstCharMethod; - private readonly int _trackcount; + private readonly DynamicMethod _scanMethod; - // Delegates are lazily created to avoid forcing JIT'ing until the regex is actually executed. - private Action? _go; - private Func? _findFirstChar; + // Delegate is lazily created to avoid forcing JIT'ing until the regex is actually executed. + private CompiledRegexRunner.ScanDelegate? 
_scan; - public CompiledRegexRunnerFactory(DynamicMethod goMethod, DynamicMethod findFirstCharMethod, int trackcount) + public CompiledRegexRunnerFactory(DynamicMethod scanMethod) { - _goMethod = goMethod; - _findFirstCharMethod = findFirstCharMethod; - _trackcount = trackcount; + _scanMethod = scanMethod; } protected internal override RegexRunner CreateInstance() => new CompiledRegexRunner( - _go ??= _goMethod.CreateDelegate>(), - _findFirstChar ??= _findFirstCharMethod.CreateDelegate>(), - _trackcount); + _scan ??= _scanMethod.CreateDelegate()); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs index f4b2a7fb2e980..2c34694f1ecaf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs @@ -16,7 +16,7 @@ public class Group : Capture internal int _capcount; internal CaptureCollection? _capcoll; - internal Group(string text, int[] caps, int capcount, string name) + internal Group(string? text, int[] caps, int capcount, string name) : base(text, capcount == 0 ? 0 : caps[(capcount - 1) * 2], capcount == 0 ? 0 : caps[(capcount * 2) - 1]) { _caps = caps; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 3c67526b40e18..8ae239aac9b3d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -50,7 +50,7 @@ public class Match : Group internal bool _balancing; // whether we've done any balancing with this match. If we // have done balancing, we'll need to do extra work in Tidy(). - internal Match(Regex? 
regex, int capcount, string text, int begpos, int len, int startpos) : + internal Match(Regex? regex, int capcount, string? text, int begpos, int len, int startpos) : base(text, new int[2], 0, "0") { _regex = regex; @@ -66,7 +66,7 @@ internal Match(Regex? regex, int capcount, string text, int begpos, int len, int /// Returns an empty Match object. public static Match Empty { get; } = new Match(null, 1, string.Empty, 0, 0, 0); - internal void Reset(Regex regex, string text, int textbeg, int textend, int textstart) + internal void Reset(Regex regex, string? text, int textbeg, int textend, int textstart) { _regex = regex; Text = text; @@ -84,6 +84,16 @@ internal void Reset(Regex regex, string text, int textbeg, int textend, int text _groupcoll?.Reset(); } + /// + /// Returns if this object represents a successful match, and otherwise. + /// + /// + /// The main difference between the public property and this one, is that requires + /// for a to call first, in order to report the correct value, while this API will return + /// the correct value right after a Match gets calculated, meaning that it will return right after + /// + internal bool FoundMatch => _matchcount[0] > 0; + public virtual GroupCollection Groups => _groupcoll ??= new GroupCollection(this, null); /// @@ -94,6 +104,7 @@ internal void Reset(Regex regex, string text, int textbeg, int textend, int text public Match NextMatch() { Regex? r = _regex; + Debug.Assert(Text != null); return r != null ? r.Run(false, Length, Text, _textbeg, _textend - _textbeg, _textpos)! : this; @@ -338,7 +349,7 @@ internal sealed class MatchSparse : Match { private new readonly Hashtable _caps; - internal MatchSparse(Regex regex, Hashtable caps, int capcount, string text, int begpos, int len, int startpos) : + internal MatchSparse(Regex regex, Hashtable caps, int capcount, string? 
text, int begpos, int len, int startpos) : base(regex, capcount, text, begpos, len, startpos) { _caps = caps; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index cd0494c3dc009..4bf6af10683fb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -13,6 +13,18 @@ public partial class Regex public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => RegexCache.GetOrAdd(pattern).IsMatch(input); + /// + /// Indicates whether the specified regular expression finds a match in the specified input span. + /// + /// The span to search for a match. + /// The regular expression pattern to match. + /// if the regular expression finds a match; otherwise, . + /// A regular expression parsing error occurred. + /// is + /// A time-out occurred. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => + RegexCache.GetOrAdd(pattern).IsMatch(input); + /// /// Searches the input string for one or more occurrences of the text /// supplied in the pattern parameter with matching options supplied in the options @@ -21,9 +33,39 @@ public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Reg public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); + /// + /// Indicates whether the specified regular expression finds a match in the specified input span, using the specified matching options. + /// + /// The span to search for a match. + /// The regular expression pattern to match. 
+ /// A bitwise combination of the enumeration values that provide options for matching. + /// if the regular expression finds a match; otherwise, . + /// A regular expression parsing error occurred. + /// is + /// A time-out occurred. + /// is not in a valid value. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => + RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); + public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); + /// + /// Indicates whether the specified regular expression finds a match in the specified input span, using the specified matching options and time-out interval. + /// + /// The span to search for a match. + /// The regular expression pattern to match. + /// A bitwise combination of the enumeration values that provide options for matching. + /// A time-out interval, or to indicate that the method should not time out. + /// if the regular expression finds a match; otherwise, . + /// A regular expression parsing error occurred. + /// is + /// A time-out occurred. + /// is not in a valid value or is negative, + /// zero, or greater than approximately 24 days. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => + RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); + /// /// Searches the input string for one or more matches using the previous pattern, /// options, and starting position. @@ -38,6 +80,15 @@ public bool IsMatch(string input) return Run(quick: true, -1, input, 0, input.Length, UseOptionR() ? 
input.Length : 0) is null; } + /// + /// Indicates whether the regular expression specified in the Regex constructor finds a match in a specified input span. + /// + /// The span to search for a match. + /// if the regular expression finds a match; otherwise, . + /// A time-out occurred. + public bool IsMatch(ReadOnlySpan input) => + Run(input, UseOptionR() ? input.Length : 0) is null; + /// /// Searches the input string for one or more matches using the previous pattern and options, /// with a new starting position. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index c23f309c9f34d..c07558b20f1c5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -372,7 +372,90 @@ protected void InitializeReferences() RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - return runner.Scan(this, input, beginning, beginning + length, startat, prevlen, quick, internalMatchTimeout); + runner.InitializeTimeout(internalMatchTimeout); + runner.runtext = input; + ReadOnlySpan span = input.AsSpan(beginning, length); + runner.InitializeForScan(this, span, startat - beginning, quick); + + int stoppos = RightToLeft ? 0 : span.Length; + + // If previous match was empty or failed, advance by one before matching. + if (prevlen == 0) + { + if (runner.runtextstart == stoppos) + { + return RegularExpressions.Match.Empty; + } + + runner.runtextpos += RightToLeft ? -1 : 1; + } + + return InternalPerformScan(quick, input, beginning, runner, span, returnNullIfQuick: true); + } + finally + { + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache. + _runner = runner; + } + } + + private static Match? 
InternalPerformScan(bool quick, string input, int beginning, RegexRunner runner, ReadOnlySpan span, bool returnNullIfQuick) + { + runner.Scan(span); + + Match? match = runner.runmatch; + Debug.Assert(match is not null); + + // If we got a match, do some cleanup and return it, or return null if quick is true; + if (match.FoundMatch) + { + if (!quick) + { + // We're about to return the Match object. Store the input into it and remove it from the runner. + match.Text = input; + runner.runmatch = null; + } + else if (returnNullIfQuick) + { + match.Text = null; + return null; + } + + match.Tidy(runner.runtextpos); + + // If the passed in beginning was not 0 then we need to adjust the offsets on the match object. + if (beginning != 0) + { + match.AddBeginningToIndex(beginning); + } + + return match; + } + + // We failed to match, so we will return Match.Empty which means we can reuse runmatch object. + // We do however need to clear its Text in case it was set, so as to not keep it alive in some cache. + runner.runmatch!.Text = null; + + return RegularExpressions.Match.Empty; + } + + internal Match? Run(ReadOnlySpan input, int startat) + { + // startat parameter is always either 0 or input.Length since public API for IsMatch doesn't have an overload + // that takes in startat. + Debug.Assert(startat <= input.Length); + + RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); + try + { + runner.InitializeTimeout(internalMatchTimeout); + runner.InitializeForScan(this, input, startat, quick: true); + + runner.Scan(input); + + // If runmatch is null it means that an override of Scan didn't implement it correctly, so we will + // let this null ref since there are lots of ways where you can end up in a erroneous state. + return runner.runmatch!.FoundMatch ? 
null : RegularExpressions.Match.Empty; } finally { @@ -387,10 +470,84 @@ internal void Run(string input, int startat, ref TState state, MatchCall RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - runner.ScanInternal(this, input, startat, ref state, callback, reuseMatchObject, internalMatchTimeout); + runner.InitializeTimeout(internalMatchTimeout); + runner.runtext = input; + int runtextpos = startat; + while (true) + { + runner.InitializeForScan(this, input, startat, false); + runner.runtextpos = runtextpos; + + int stoppos = RightToLeft ? 0 : input.Length; + + Match? match = InternalPerformScan(reuseMatchObject, input, 0, runner, input, returnNullIfQuick: false); + Debug.Assert(match is not null); + + // if we got a match, then call the callback function with the match and prepare for next iteration. + if (match.Success) + { + if (!reuseMatchObject) + { + // We're not reusing match objects, so null out our field reference to the instance. + // It'll be recreated the next time one is needed. + runner.runmatch = null; + } + + if (!callback(ref state, match)) + { + // If the callback returns false, we're done. + + if (reuseMatchObject) + { + // We're reusing the single match instance, so clear out its text as well. + // We don't do this if we're not reusing instances, as in that case we're + // dropping the whole reference to the match, and we no longer own the instance + // having handed it out to the callback. + match.Text = null; + } + return; + } + + // Now that we've matched successfully, update the starting position to reflect + // the current position, just as Match.NextMatch() would pass in _textpos as textstart. + runtextpos = startat = runner.runtextpos; + + // Reset state for another iteration. 
+ runner.runtrackpos = runner.runtrack!.Length; + runner.runstackpos = runner.runstack!.Length; + runner.runcrawlpos = runner.runcrawl!.Length; + + if (match.Length == 0) + { + if (runner.runtextpos == stoppos) + { + if (reuseMatchObject) + { + // See above comment. + match.Text = null; + } + return; + } + + runtextpos += RightToLeft ? -1 : 1; + } + + // Loop around to perform next match from where we left off. + continue; + } + else + { + // We failed to match at this position. If we're at the stopping point, we're done. + if (runner.runtextpos == stoppos) + { + return; + } + } + } } finally { + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache. _runner = runner; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 9edcf1cfae63c..cd77b9271d225 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Reflection; using System.Reflection.Emit; @@ -20,7 +21,6 @@ internal abstract class RegexCompiler private static readonly FieldInfo s_runtextendField = RegexRunnerField("runtextend"); private static readonly FieldInfo s_runtextstartField = RegexRunnerField("runtextstart"); private static readonly FieldInfo s_runtextposField = RegexRunnerField("runtextpos"); - private static readonly FieldInfo s_runtextField = RegexRunnerField("runtext"); private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack"); private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture"); @@ -29,9 +29,9 @@ internal abstract class RegexCompiler private 
static readonly MethodInfo s_isMatchedMethod = RegexRunnerMethod("IsMatched"); private static readonly MethodInfo s_matchLengthMethod = RegexRunnerMethod("MatchLength"); private static readonly MethodInfo s_matchIndexMethod = RegexRunnerMethod("MatchIndex"); - private static readonly MethodInfo s_isBoundaryMethod = RegexRunnerMethod("IsBoundary"); + private static readonly MethodInfo s_isBoundaryMethod = typeof(RegexRunner).GetMethod("IsBoundary", BindingFlags.NonPublic | BindingFlags.Instance, new[] { typeof(ReadOnlySpan), typeof(int) })!; private static readonly MethodInfo s_isWordCharMethod = RegexRunnerMethod("IsWordChar"); - private static readonly MethodInfo s_isECMABoundaryMethod = RegexRunnerMethod("IsECMABoundary"); + private static readonly MethodInfo s_isECMABoundaryMethod = typeof(RegexRunner).GetMethod("IsECMABoundary", BindingFlags.NonPublic | BindingFlags.Instance, new[] { typeof(ReadOnlySpan), typeof(int) })!; private static readonly MethodInfo s_crawlposMethod = RegexRunnerMethod("Crawlpos"); private static readonly MethodInfo s_charInClassMethod = RegexRunnerMethod("CharInClass"); private static readonly MethodInfo s_checkTimeoutMethod = RegexRunnerMethod("CheckTimeout"); @@ -180,6 +180,9 @@ internal abstract class RegexCompiler /// A macro for _ilg.Emit(OpCodes.Ldarg_0). protected void Ldthis() => _ilg!.Emit(OpCodes.Ldarg_0); + /// A macro for _ilgEmit(OpCodes.Ldarg_1) + private void Ldarg_1() => _ilg!.Emit(OpCodes.Ldarg_1); + /// A macro for Ldthis(); Ldfld(); protected void Ldthisfld(FieldInfo ft) { @@ -187,6 +190,10 @@ protected void Ldthisfld(FieldInfo ft) _ilg!.Emit(OpCodes.Ldfld, ft); } + /// Fetches the address of argument in passed in + /// The position of the argument which address needs to be fetched. 
+ private void Ldarga_s(int position) => _ilg!.Emit(OpCodes.Ldarga_S, position); + /// A macro for Ldthis(); Ldfld(); Stloc(); private void Mvfldloc(FieldInfo ft, LocalBuilder lt) { @@ -271,6 +278,9 @@ private void Mvfldloc(FieldInfo ft, LocalBuilder lt) private void Switch(Label[] table) => _ilg!.Emit(OpCodes.Switch, table); + /// Declares a local bool. + private LocalBuilder DeclareBool() => _ilg!.DeclareLocal(typeof(bool)); + /// Declares a local int. private LocalBuilder DeclareInt32() => _ilg!.DeclareLocal(typeof(int)); @@ -353,8 +363,8 @@ private void CallToLower() } } - /// Generates the implementation for FindFirstChar. - protected void EmitFindFirstChar() + /// Generates the implementation for TryFindNextPossibleStartingPosition. + protected void EmitTryFindNextPossibleStartingPosition() { Debug.Assert(_code != null); _int32LocalsPool?.Clear(); @@ -388,11 +398,10 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or // Load necessary locals // int pos = base.runtextpos; // int end = base.runtextend; - // ReadOnlySpan inputSpan = base.runtext.AsSpan(); + // ReadOnlySpan inputSpan = input; Mvfldloc(s_runtextposField, pos); Mvfldloc(s_runtextendField, end); - Ldthisfld(s_runtextField); - Call(s_stringAsSpanMethod); + Ldarg_1(); Stloc(inputSpan); // Generate length check. If the input isn't long enough to possibly match, fail quickly. @@ -470,7 +479,7 @@ bool GenerateAnchors() { Label label; - // Anchors that fully implement FindFirstChar, with a check that leads to immediate success or failure determination. + // Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. switch (_code.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: @@ -1016,8 +1025,8 @@ void EmitLiteralAfterAtomicLoop() } } - /// Generates the implementation for Go. 
- protected void EmitGo() + /// Generates the implementation for TryMatchAtCurrentPosition. + protected void EmitTryMatchAtCurrentPosition() { // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled // version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via @@ -1036,7 +1045,7 @@ protected void EmitGo() // label that code should jump back to when backtracking. That way, a subsequent EmitXx function doesn't need to know exactly // where to jump: it simply always jumps to "doneLabel" on match failure, and "doneLabel" is always configured to point to // the right location. In an expression without backtracking, or before any backtracking constructs have been encountered, - // "doneLabel" is simply the final return location from the Go method that will undo any captures and exit, signaling to + // "doneLabel" is simply the final return location from the TryMatchAtCurrentPosition method that will undo any captures and exit, signaling to // the calling scan loop that nothing was matched. Debug.Assert(_code != null); @@ -1051,16 +1060,16 @@ protected void EmitGo() // Skip the Capture node. We handle the implicit root capture specially. node = node.Child(0); - // In some limited cases, FindFirstChar will only return true if it successfully matched the whole expression. - // We can special case these to do essentially nothing in Go other than emit the capture. + // In some limited cases, TryFindNextPossibleStartingPosition will only return true if it successfully matched the whole expression. + // We can special case these to do essentially nothing in TryMatchAtCurrentPosition other than emit the capture. 
switch (node.Kind) { case RegexNodeKind.Multi or RegexNodeKind.Notone or RegexNodeKind.One or RegexNodeKind.Set when !IsCaseInsensitive(node): // This is the case for single and multiple characters, though the whole thing is only guaranteed - // to have been validated in FindFirstChar when doing case-sensitive comparison. + // to have been validated in TryFindNextPossibleStartingPosition when doing case-sensitive comparison. // base.Capture(0, base.runtextpos, base.runtextpos + node.Str.Length); // base.runtextpos = base.runtextpos + node.Str.Length; - // return; + // return true; Ldthis(); Dup(); Ldc(0); @@ -1073,6 +1082,7 @@ protected void EmitGo() Ldc(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1); Add(); Stfld(s_runtextposField); + Ldc(1); Ret(); return; @@ -1097,10 +1107,9 @@ protected void EmitGo() // CultureInfo culture = CultureInfo.CurrentCulture; // only if the whole expression or any subportion is ignoring case, and we're not using invariant InitializeCultureForGoIfNecessary(); - // ReadOnlySpan inputSpan = base.runtext.AsSpan(); + // ReadOnlySpan inputSpan = input; // int end = base.runtextend; - Ldthisfld(s_runtextField); - Call(s_stringAsSpanMethod); + Ldarg_1(); Stloc(inputSpan); Mvfldloc(s_runtextendField, end); @@ -1154,6 +1163,9 @@ protected void EmitGo() Ldloc(originalPos); Ldloc(pos); Call(s_captureMethod); + // return true; + Ldc(1); + Ret(); // If the graph contained captures, undo any remaining to handle failed matches. if (expressionHasCaptures) @@ -1184,7 +1196,8 @@ protected void EmitGo() MarkLabel(originalDoneLabel); } - // return; + // return false; + Ldc(0); Ret(); // Generated code successfully. 
@@ -2311,16 +2324,15 @@ void EmitBoundary(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary, $"Unexpected type: {node.Kind}"); - // if (!IsBoundary(pos + sliceStaticPos, base.runtextbeg, end)) goto doneLabel; + // if (!IsBoundary(inputSpan, pos + sliceStaticPos)) goto doneLabel; Ldthis(); + Ldloc(inputSpan); Ldloc(pos); if (sliceStaticPos > 0) { Ldc(sliceStaticPos); Add(); } - Ldthisfld(s_runtextbegField); - Ldloc(end); switch (node.Kind) { case RegexNodeKind.Boundary: @@ -3953,6 +3965,52 @@ void EmitStackPop() } } + protected void EmitScan(DynamicMethod tryFindNextStartingPositionMethod, DynamicMethod tryMatchAtCurrentPositionMethod) + { + Label returnLabel = DefineLabel(); + + // while (TryFindNextPossibleStartingPosition(text)) + Label whileLoopBody = DefineLabel(); + MarkLabel(whileLoopBody); + Ldthis(); + Ldarg_1(); + Call(tryFindNextStartingPositionMethod); + BrfalseFar(returnLabel); + + if (_hasTimeout) + { + // CheckTimeout(); + Ldthis(); + Call(s_checkTimeoutMethod); + } + + // if (TryMatchAtCurrentPosition(text) || runtextpos == text.length) + // return; + Ldthis(); + Ldarg_1(); + Call(tryMatchAtCurrentPositionMethod); + BrtrueFar(returnLabel); + Ldthisfld(s_runtextposField); + Ldarga_s(1); + Call(s_spanGetLengthMethod); + Ceq(); + BrtrueFar(returnLabel); + + // runtextpos += 1 + Ldthis(); + Ldthisfld(s_runtextposField); + Ldc(1); + Add(); + Stfld(s_runtextposField); + + // End loop body. 
+ BrFar(whileLoopBody); + + // return; + MarkLabel(returnLabel); + Ret(); + } + private void InitializeCultureForGoIfNecessary() { _textInfo = null; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 9f615754a808f..1092db83c243f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -324,15 +324,46 @@ private bool MatchRef(int index, int length, ReadOnlySpan inputSpan) private void Backwardnext() => runtextpos += _rightToLeft ? 1 : -1; - protected override bool FindFirstChar() => - _code.FindOptimizations.TryFindNextStartingPosition(runtext!, ref runtextpos, runtextbeg, runtextstart, runtextend); + protected internal override void Scan(ReadOnlySpan text) + { + Debug.Assert(runregex is not null); + Debug.Assert(runtrack is not null); + Debug.Assert(runstack is not null); + Debug.Assert(runcrawl is not null); + + // Configure the additional value to "bump" the position along each time we loop around + // to call TryFindNextStartingPosition again, as well as the stopping position for the loop. We generally + // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump + // by -1 and stop at textbeg. + int bump = 1, stoppos = text.Length; + if (runregex.RightToLeft) + { + bump = -1; + stoppos = 0; + } + + while (_code.FindOptimizations.TryFindNextStartingPosition(text, ref runtextpos, runtextbeg, runtextstart, runtextend)) + { + CheckTimeout(); + + if (TryMatchAtCurrentPosition(text) || runtextpos == stoppos) + { + return; + } + + // Reset state for another iteration. 
+ runtrackpos = runtrack.Length; + runstackpos = runstack.Length; + runcrawlpos = runcrawl.Length; + runtextpos += bump; + } + } - protected override void Go() + private bool TryMatchAtCurrentPosition(ReadOnlySpan inputSpan) { SetOperator((RegexOpcode)_code.Codes[0]); _codepos = 0; int advance = -1; - ReadOnlySpan inputSpan = runtext; while (true) { @@ -354,7 +385,7 @@ protected override void Go() switch (_operator) { case RegexOpcode.Stop: - return; + return runmatch!.FoundMatch; case RegexOpcode.Nothing: break; @@ -711,7 +742,7 @@ protected override void Go() continue; case RegexOpcode.Boundary: - if (!IsBoundary(runtextpos, runtextbeg, runtextend)) + if (!IsBoundary(inputSpan, runtextpos)) { break; } @@ -719,7 +750,7 @@ protected override void Go() continue; case RegexOpcode.NonBoundary: - if (IsBoundary(runtextpos, runtextbeg, runtextend)) + if (IsBoundary(inputSpan, runtextpos)) { break; } @@ -727,7 +758,7 @@ protected override void Go() continue; case RegexOpcode.ECMABoundary: - if (!IsECMABoundary(runtextpos, runtextbeg, runtextend)) + if (!IsECMABoundary(inputSpan, runtextpos)) { break; } @@ -735,7 +766,7 @@ protected override void Go() continue; case RegexOpcode.NonECMABoundary: - if (IsECMABoundary(runtextpos, runtextbeg, runtextend)) + if (IsECMABoundary(inputSpan, runtextpos)) { break; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 34b7f1b113059..fe467efd05a41 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -24,7 +24,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler private static readonly bool s_includePatternInName = Environment.GetEnvironmentVariable(IncludePatternInNamesEnvVar) == "1"; /// 
Parameter types for the generated Go and FindFirstChar methods. - private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner) }; + private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner), typeof(ReadOnlySpan) }; /// Id number to use for the next compiled regex. private static int s_regexCount; @@ -52,17 +52,20 @@ internal sealed class RegexLWCGCompiler : RegexCompiler description = string.Concat("_", pattern.Length > DescriptionLimit ? pattern.AsSpan(0, DescriptionLimit) : pattern); } - DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner)); - EmitFindFirstChar(); + DynamicMethod tryfindNextPossibleStartPositionMethod = DefineDynamicMethod($"Regex{regexNum}_TryFindNextPossibleStartingPosition{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); + EmitTryFindNextPossibleStartingPosition(); - DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner)); - EmitGo(); + DynamicMethod tryMatchAtCurrentPositionMethod = DefineDynamicMethod($"Regex{regexNum}_TryMatchAtCurrentPosition{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); + EmitTryMatchAtCurrentPosition(); - return new CompiledRegexRunnerFactory(goMethod, findFirstCharMethod, code.TrackCount); + DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan) }); + EmitScan(tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod); + + return new CompiledRegexRunnerFactory(scanMethod); } /// Begins the definition of a new method (no args) with a specified return value. - private DynamicMethod DefineDynamicMethod(string methname, Type? returntype, Type hostType) + private DynamicMethod DefineDynamicMethod(string methname, Type? 
returntype, Type hostType, Type[] paramTypes) { // We're claiming that these are static methods, but really they are instance methods. // By giving them a parameter which represents "this", we're tricking them into @@ -71,7 +74,7 @@ private DynamicMethod DefineDynamicMethod(string methname, Type? returntype, Typ const MethodAttributes Attribs = MethodAttributes.Public | MethodAttributes.Static; const CallingConventions Conventions = CallingConventions.Standard; - var dm = new DynamicMethod(methname, Attribs, Conventions, returntype, s_paramTypes, hostType, skipVisibility: false); + var dm = new DynamicMethod(methname, Attribs, Conventions, returntype, paramTypes, hostType, skipVisibility: false); _ilg = dm.GetILGenerator(); return dm; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 13a9fbf155bb1..13a20bb44c60a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -20,8 +20,12 @@ namespace System.Text.RegularExpressions { public abstract class RegexRunner { - protected internal int runtextbeg; // beginning of text to search - protected internal int runtextend; // end of text to search + protected internal int runtextbeg; // Beginning of text to search. We now always use a sliced span of the input + // from runtextbeg to runtextend, which means that runtextbeg is now always 0 except + // for CompiledToAssembly scenario which works over the original input. + protected internal int runtextend; // End of text to search. Because we now pass in a sliced span of the input into Scan, + // the runtextend will always match the length of that passed in span except for CompileToAssemby + // scenario, which still works over the original input. 
protected internal int runtextstart; // starting point for search protected internal string? runtext; // text to search @@ -88,25 +92,65 @@ protected RegexRunner() { } protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) => Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout); - protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) + protected internal virtual void Scan(ReadOnlySpan text) { - this.quick = quick; + // This base implementation is overridden by all of the built-in engines and by all source-generated + // implementations. The only time this should end up being used is if someone is using a Regex-derived + // type created by .NET Framework's Regex.CompileToAssembly, in which case it will have overridden + // FindFirstChar and Go but not Scan (which didn't exist yet). This isn't an officially supported configuration, + // using assemblies built for .NET Framework and targeting .NET Framework surface area against this + // implementation, but we make a best-effort to keep things functional. + string? s = runtext; + + // We can assume that the passed in 'text' span is a slice of the original text input runtext. That said we need to calculate + // what the original beginning was and can't do it by just using the lengths of text and runtext, since we can't guarantee that + // the passed in beginning and length match the size of the original input. We instead use MemoryExtensions Overlaps to find the + // offset in memory between them. We intentionally use s.Overlaps(text) since we want to get a positive value. + s.AsSpan().Overlaps(text, out int beginning); + + // The passed in span is sliced from runtextbeg to runtextend already, but in the precompiled scenario + // we require to use the complete input and to use the full string instead. 
We first test to ensure that the + // passed in span matches the original input by using the original runtextbeg. If that is not the case, + // then it means the user is calling the new span-based APIs using CompiledToAssembly, so we throw NSE + // so as to prevent a lot of unexpected allocations. + if (s == null || text != s.AsSpan(beginning, text.Length)) + { + // If we landed here then we are dealing with a CompiledToAssembly case where the new Span overloads are being called. + throw new NotSupportedException(SR.UsingSpanAPIsWithCompiledToAssembly); + } - // Handle timeout argument - _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds - bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; - if (!ignoreTimeout) + // If the original beginning wasn't zero, then we have to adjust some of the + // internal fields of RegexRunner to ensure the Precompiled Go and FFC methods + // will continue to work as expected since they work over the original input, as opposed + // to using the sliced span. + if (beginning != 0) { - // We are using Environment.TickCount and not Stopwatch for performance reasons. - // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt - // overflow it will still stay ahead of Environment.TickCount for comparisons made - // in DoCheckTimeout(). - Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected - _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; - _timeoutOccursAt = Environment.TickCount + _timeout; - _timeoutChecksToSkip = TimeoutCheckFrequency; + runtextbeg = beginning; + runtextstart += beginning; + runtextend += beginning; } + InternalScan(runregex!, beginning, beginning + text.Length); + } + + /// + /// This method's body is only kept since it is a protected member that could be called by someone outside + /// the assembly. + /// + protected internal Match? 
Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) + { + InitializeTimeout(timeout); + + // We set runtext before calling InitializeForScan so that runmatch object is initialized with the text + runtext = text; + + InitializeForScan(regex, text, textstart, quick); + + // InitializeForScan will default runtextstart and runtextend to 0 and length of string + // since it is configured to work over a sliced portion of text so we adjust those values. + runtextstart = textstart; + runtextend = textend; + // Configure the additional value to "bump" the position along each time we loop around // to call FindFirstChar again, as well as the stopping position for the loop. We generally // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump @@ -118,10 +162,6 @@ protected RegexRunner() { } stoppos = textbeg; } - // Store runtextpos into field, as we may bump it in next check. The remaining arguments - // are stored below once we're past the potential return in the next check. - runtextpos = textstart; - // If previous match was empty or failed, advance by one before matching. if (prevlen == 0) { @@ -133,16 +173,41 @@ protected RegexRunner() { } runtextpos += bump; } - // Store remaining arguments into fields now that we're going to start the scan. - // These are referenced by the derived runner. 
- runregex = regex; - runtext = text; - runtextstart = textstart; - runtextbeg = textbeg; - runtextend = textend; + Match match = InternalScan(regex, textbeg, textend); + runtext = null; //drop reference + + if (match.FoundMatch) + { + if (quick) + { + return null; + } + + runmatch = null; + match.Tidy(runtextpos); + } + else + { + runmatch!.Text = null; + } + + return match; + + } + + private Match InternalScan(Regex regex, int textbeg, int textend) + { + // Configure the additional value to "bump" the position along each time we loop around + // to call FindFirstChar again, as well as the stopping position for the loop. We generally + // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump + // by -1 and stop at textbeg. + int bump = 1, stoppos = textend; + if (regex.RightToLeft) + { + bump = -1; + stoppos = textbeg; + } - // Main loop: FindFirstChar/Go + bump until the ending position. - bool initialized = false; while (true) { // Find the next potential location for a match in the input. @@ -151,18 +216,7 @@ protected RegexRunner() { } #endif if (FindFirstChar()) { - if (!ignoreTimeout) - { - DoCheckTimeout(); - } - - // Ensure that the runner is initialized. This includes initializing all of the state in the runner - // that Go might use, such as the backtracking stack, as well as a Match object for it to populate. - if (!initialized) - { - InitializeForGo(); - initialized = true; - } + CheckTimeout(); // See if there's a match at this position. #if DEBUG @@ -170,22 +224,9 @@ protected RegexRunner() { } #endif Go(); - // If we got a match, we're done. - Match match = runmatch!; - if (match._matchcount[0] > 0) + if (runmatch!.FoundMatch) { - runtext = null; // drop reference to text to avoid keeping it alive in a cache - - if (quick) - { - runmatch!.Text = null!; // drop reference - return null; - } - - // Return the match in its canonical form. 
- runmatch = null; - match.Tidy(runtextpos); - return match; + return runmatch; } // Reset state for another iteration. @@ -197,8 +238,6 @@ protected RegexRunner() { } // We failed to match at this position. If we're at the stopping point, we're done. if (runtextpos == stoppos) { - runtext = null; // drop reference to text to avoid keeping it alive in a cache - if (runmatch != null) runmatch.Text = null!; return Match.Empty; } @@ -207,159 +246,91 @@ protected RegexRunner() { } } } - /// Enumerates all of the matches with the specified regex, invoking the callback for each. - /// - /// This optionally repeatedly hands out the same Match instance, updated with new information. - /// should be set to false if the Match object is handed out to user code. - /// - internal void ScanInternal(Regex regex, string text, int textstart, ref TState state, MatchCallback callback, bool reuseMatchObject, TimeSpan timeout) + internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textstart, bool quick) { - quick = false; + // Store remaining arguments into fields now that we're going to start the scan. + // These are referenced by the derived runner. + this.quick = quick; + runregex = regex; + runtextstart = textstart; + runtextbeg = 0; + runtextend = text.Length; + runtextpos = textstart; - // Handle timeout argument - _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds - bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; - if (!ignoreTimeout) + if (runmatch is null) { - // We are using Environment.TickCount and not Stopwatch for performance reasons. - // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt - // overflow it will still stay ahead of Environment.TickCount for comparisons made - // in DoCheckTimeout(). 
- _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; - _timeoutOccursAt = Environment.TickCount + _timeout; - _timeoutChecksToSkip = TimeoutCheckFrequency; + // Use a hashtabled Match object if the capture numbers are sparse + runmatch = runregex!.caps is null ? + new Match(runregex, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart) : + new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart); + } + else + { + runmatch.Reset(runregex!, runtext, runtextbeg, runtextend, runtextstart); } - // Configure the additional value to "bump" the position along each time we loop around - // to call FindFirstChar again, as well as the stopping position for the loop. We generally - // bump by 1 and stop at text.Length, but if we're examining right-to-left, we instead bump - // by -1 and stop at 0. - int bump = 1, stoppos = text.Length; - if (regex.RightToLeft) + // Note we test runcrawl, because it is the last one to be allocated + // If there is an alloc failure in the middle of the three allocations, + // we may still return to reuse this instance, and we want to behave + // as if the allocations didn't occur. + if (runcrawl != null) { - bump = -1; - stoppos = 0; + runtrackpos = runtrack!.Length; + runstackpos = runstack!.Length; + runcrawlpos = runcrawl.Length; + return; } - // Store remaining arguments into fields now that we're going to start the scan. - // These are referenced by the derived runner. - runregex = regex; - runtextstart = runtextpos = textstart; - runtext = text; - runtextend = text.Length; - runtextbeg = 0; + // Everything above runs once per match. + // Everything below runs once per runner. - // Main loop: FindFirstChar/Go + bump until the ending position. - bool initialized = false; - while (true) - { - // Find the next potential location for a match in the input. 
-#if DEBUG - Debug.WriteLineIf(Regex.EnableDebugTracing, $"Calling FindFirstChar at {nameof(runtextbeg)}={runtextbeg}, {nameof(runtextpos)}={runtextpos}, {nameof(runtextend)}={runtextend}"); -#endif - if (FindFirstChar()) - { - if (!ignoreTimeout) - { - DoCheckTimeout(); - } + InitTrackCount(); - // Ensure that the runner is initialized. This includes initializing all of the state in the runner - // that Go might use, such as the backtracking stack, as well as a Match object for it to populate. - if (!initialized) - { - InitializeForGo(); - initialized = true; - } + int stacksize; + int tracksize = stacksize = runtrackcount * 8; -#if DEBUG - Debug.WriteLineIf(Regex.EnableDebugTracing, $"Calling Go at {nameof(runtextpos)}={runtextpos}"); -#endif + if (tracksize < 32) + { + tracksize = 32; + } + if (stacksize < 16) + { + stacksize = 16; + } - // See if there's a match at this position. - Go(); + runtrack = new int[tracksize]; + runtrackpos = tracksize; - // See if we have a match. - Match match = runmatch!; - if (match._matchcount[0] > 0) - { - // Hand it out to the callback in canonical form. - if (!reuseMatchObject) - { - // We're not reusing match objects, so null out our field reference to the instance. - // It'll be recreated the next time one is needed. - runmatch = null; - } - match.Tidy(runtextpos); - initialized = false; - if (!callback(ref state, match)) - { - // If the callback returns false, we're done. - // Drop reference to text to avoid keeping it alive in a cache. - runtext = null!; - if (reuseMatchObject) - { - // We're reusing the single match instance, so clear out its text as well. - // We don't do this if we're not reusing instances, as in that case we're - // dropping the whole reference to the match, and we no longer own the instance - // having handed it out to the callback. 
- match.Text = null!; - } - return; - } - - // Now that we've matched successfully, update the starting position to reflect - // the current position, just as Match.NextMatch() would pass in _textpos as textstart. - runtextstart = runtextpos; - - // Reset state for another iteration. - runtrackpos = runtrack!.Length; - runstackpos = runstack!.Length; - runcrawlpos = runcrawl!.Length; - if (match.Length == 0) - { - if (runtextpos == stoppos) - { - // Drop reference to text to avoid keeping it alive in a cache. - runtext = null!; - if (reuseMatchObject) - { - // See above comment. - match.Text = null!; - } - return; - } - - runtextpos += bump; - } - - // Loop around to perform next match from where we left off. - continue; - } + runstack = new int[stacksize]; + runstackpos = stacksize; - // Ran Go but it didn't find a match. Reset state for another iteration. - runtrackpos = runtrack!.Length; - runstackpos = runstack!.Length; - runcrawlpos = runcrawl!.Length; - } + runcrawl = new int[32]; + runcrawlpos = 32; + } - // We failed to match at this position. If we're at the stopping point, we're done. - if (runtextpos == stoppos) + internal void InitializeTimeout(TimeSpan timeout) + { + // Handle timeout argument + _ignoreTimeout = true; + if (Regex.InfiniteMatchTimeout != timeout) + { + ConfigureTimeout(timeout); + + void ConfigureTimeout(TimeSpan timeout) { - runtext = null; // drop reference to text to avoid keeping it alive in a cache - if (runmatch != null) - { - runmatch.Text = null!; - } - return; + // We are using Environment.TickCount and not Stopwatch for performance reasons. + // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt + // overflow it will still stay ahead of Environment.TickCount for comparisons made + // in DoCheckTimeout(). 
+ _ignoreTimeout = false; + _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; + _timeoutOccursAt = Environment.TickCount + _timeout; + _timeoutChecksToSkip = TimeoutCheckFrequency; } - - // Bump by one (in whichever direction is appropriate) and loop to go again. - runtextpos += bump; } } - protected void CheckTimeout() + protected internal void CheckTimeout() { if (_ignoreTimeout) return; @@ -385,7 +356,9 @@ private void DoCheckTimeout() if (0 > _timeoutOccursAt && 0 < currentMillis) return; - throw new RegexMatchTimeoutException(runtext!, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout)); + string input = runtext ?? string.Empty; + + throw new RegexMatchTimeoutException(input, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout)); } /// @@ -394,77 +367,21 @@ private void DoCheckTimeout() /// then to leave runtextpos at the ending position. It should leave /// runtextpos where it started if there was no match. /// - protected abstract void Go(); + protected virtual void Go() => throw new NotImplementedException(); /// /// The responsibility of FindFirstChar() is to advance runtextpos /// until it is at the next position which is a candidate for the /// beginning of a successful match. /// - protected abstract bool FindFirstChar(); + protected virtual bool FindFirstChar() => throw new NotImplementedException(); /// /// InitTrackCount must initialize the runtrackcount field; this is /// used to know how large the initial runtrack and runstack arrays /// must be. /// - protected abstract void InitTrackCount(); - - /// - /// Initializes all the data members that are used by Go() - /// - private void InitializeForGo() - { - if (runmatch is null) - { - // Use a hashtabled Match object if the capture numbers are sparse - runmatch = runregex!.caps is null ? 
- new Match(runregex, runregex.capsize, runtext!, runtextbeg, runtextend - runtextbeg, runtextstart) : - new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext!, runtextbeg, runtextend - runtextbeg, runtextstart); - } - else - { - runmatch.Reset(runregex!, runtext!, runtextbeg, runtextend, runtextstart); - } - - // Note we test runcrawl, because it is the last one to be allocated - // If there is an alloc failure in the middle of the three allocations, - // we may still return to reuse this instance, and we want to behave - // as if the allocations didn't occur. - if (runcrawl != null) - { - runtrackpos = runtrack!.Length; - runstackpos = runstack!.Length; - runcrawlpos = runcrawl.Length; - return; - } - - // Everything above runs once per match. - // Everything below runs once per runner. - - InitTrackCount(); - - int stacksize; - int tracksize = stacksize = runtrackcount * 8; - - if (tracksize < 32) - { - tracksize = 32; - } - if (stacksize < 16) - { - stacksize = 16; - } - - runtrack = new int[tracksize]; - runtrackpos = tracksize; - - runstack = new int[stacksize]; - runstackpos = stacksize; - - runcrawl = new int[32]; - runcrawlpos = 32; - } + protected virtual void InitTrackCount() { } /// /// Called by the implementation of Go() to increase the size of storage @@ -491,6 +408,13 @@ protected bool IsBoundary(int index, int startpos, int endpos) (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext![index])); } + internal bool IsBoundary(ReadOnlySpan inputSpan, int index) + { + int indexM1 = index - 1; + return ((uint)indexM1 < (uint)inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[indexM1])) != + ((uint)index < (uint)inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[index])); + } + /// Called to determine a char's inclusion in the \w set. 
internal static bool IsWordChar(char ch) => RegexCharClass.IsWordChar(ch); @@ -500,6 +424,13 @@ protected bool IsECMABoundary(int index, int startpos, int endpos) (index < endpos && RegexCharClass.IsECMAWordChar(runtext![index])); } + internal bool IsECMABoundary(ReadOnlySpan inputSpan, int index) + { + int indexM1 = index - 1; + return ((uint)indexM1 < (uint)inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[indexM1])) != + ((uint)index < (uint)inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[index])); + } + protected static bool CharInSet(char ch, string set, string category) { string charClass = RegexCharClass.ConvertOldStringsToClass(set, category); @@ -699,7 +630,10 @@ string DescribeTextPosition() if (runtextpos > runtextbeg) { - sb.Append(RegexCharClass.DescribeChar(runtext![runtextpos - 1])); + if (runtext != null) + { + sb.Append(RegexCharClass.DescribeChar(runtext[runtextpos - 1])); + } } else { @@ -710,7 +644,10 @@ string DescribeTextPosition() for (int i = runtextpos; i < runtextend; i++) { - sb.Append(RegexCharClass.DescribeChar(runtext![i])); + if (runtext != null) + { + sb.Append(RegexCharClass.DescribeChar(runtext[i])); + } } if (sb.Length >= 64) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index c0c0ce2dd1464..6ed256639f732 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -86,23 +86,16 @@ internal Runner(SymbolicRegexMatcher matcher) _perThreadData = matcher.CreatePerThreadData(); } - protected override void InitTrackCount() { } // nop, no backtracking - - protected override bool FindFirstChar() => true; // The 
logic is all in Go. - - protected override void Go() + protected internal override void Scan(ReadOnlySpan text) { - int beginning = runtextbeg; - ReadOnlySpan inputSpan = runtext.AsSpan(beginning, runtextend - beginning); - // Perform the match. - SymbolicMatch pos = _matcher.FindMatch(quick, inputSpan, runtextpos - beginning, _perThreadData); + SymbolicMatch pos = _matcher.FindMatch(quick, text, runtextpos, _perThreadData); // Transfer the result back to the RegexRunner state. if (pos.Success) { // If we successfully matched, capture the match, and then jump the current position to the end of the match. - int start = pos.Index + beginning; + int start = pos.Index; int end = start + pos.Length; if (!quick && pos.CaptureStarts != null) { @@ -113,7 +106,7 @@ protected override void Go() if (pos.CaptureStarts[cap] >= 0) { Debug.Assert(pos.CaptureEnds[cap] >= pos.CaptureStarts[cap]); - Capture(cap, pos.CaptureStarts[cap] + beginning, pos.CaptureEnds[cap] + beginning); + Capture(cap, pos.CaptureStarts[cap], pos.CaptureEnds[cap]); } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/CustomDerivedRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/CustomDerivedRegexScenarioTest.cs new file mode 100644 index 0000000000000..5f40a3c2e56fc --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/tests/CustomDerivedRegexScenarioTest.cs @@ -0,0 +1,313 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Threading; +using Xunit; + +namespace System.Text.RegularExpressions.Tests +{ + public class CustomDerivedRegexScenarioTest + { + [Fact] + public void CallProtectedScanMethodFromCustomDerivedRegex() + { + CustomDerivedRegex regex = new(); + Assert.True(regex.CallScanDirectly(regex, "3456", 0, 4, 0, -1, false).Success); + Assert.False(regex.CallScanDirectly(regex, "456", 0, 3, 0, -1, false).Success); + Assert.Equal("45", regex.CallScanDirectly(regex, "45456", 0, 5, 0, -1, false).Value); + Assert.Equal("896", regex.CallScanDirectly(regex, "45896456", 0, 8, 2, -1, false).Value); + Assert.Equal(Match.Empty, regex.CallScanDirectly(regex, "I dont match", 0, 12, 0, -1, false)); + Assert.Null(regex.CallScanDirectly(regex, "3456", 0, 4, 0, -1, true)); + } + + } + + /// + /// This type was generated using an earlier version of the Regex Source Generator which still overrides Go and FindFirstChar. + /// The purpose of this class is to validate that if a derived RegexRunner is invoking the protected Scan methods, they should call + /// the overridden Go and FindFirstChar methods and return the expected results. + /// + internal class CustomDerivedRegex : Regex + { + private CustomRegexRunnerFactory.CustomRegexRunner runner; + + public CustomDerivedRegex() + { + pattern = /*lang=regex*/@"\G(\d{1,3})(?=(?:\d{3})+\b)"; + roptions = RegexOptions.Compiled; + internalMatchTimeout = Timeout.InfiniteTimeSpan; + factory = new CustomRegexRunnerFactory(); + capsize = 2; + MethodInfo createRunnerMethod = typeof(Regex).GetMethod("CreateRunner", BindingFlags.Instance | BindingFlags.NonPublic); + runner = createRunnerMethod.Invoke(this, new object[] { }) as CustomRegexRunnerFactory.CustomRegexRunner; + } + + public Match? 
CallScanDirectly(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) + => runner.CallScanDirectly(regex, text, textbeg, textend, textstart, prevlen, quick); + + internal class CustomRegexRunnerFactory : RegexRunnerFactory + { + protected override RegexRunner CreateInstance() => new CustomRegexRunner(); + + internal class CustomRegexRunner : RegexRunner + { + public Match? CallScanDirectly(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) + => Scan(regex, text, textbeg, textend, textstart, prevlen, quick); + + protected override void InitTrackCount() => base.runtrackcount = 12; + + // Description: + // ○ Match if at the start position. + // ○ 1st capture group. + // ○ Match a Unicode digit greedily at least 1 and at most 3 times. + // ○ Zero-width positive lookahead assertion. + // ○ Loop greedily at least once. + // ○ Match a Unicode digit exactly 3 times. + // ○ Match if at a word boundary. + + protected override bool FindFirstChar() + { + int pos = runtextpos, end = runtextend; + + if (pos < end) + { + // Start \G anchor + if (pos > runtextstart) + { + goto NoStartingPositionFound; + } + return true; + } + + // No starting position found + NoStartingPositionFound: + runtextpos = end; + return false; + } + + protected override void Go() + { + ReadOnlySpan inputSpan = runtext.AsSpan(); + int pos = base.runtextpos, end = base.runtextend; + int original_pos = pos; + int charloop_starting_pos = 0, charloop_ending_pos = 0; + int loop_iteration = 0, loop_starting_pos = 0; + int stackpos = 0; + int start = base.runtextstart; + ReadOnlySpan slice = inputSpan.Slice(pos, end - pos); + + // Match if at the start position. + { + if (pos != start) + { + goto NoMatch; + } + } + + // 1st capture group. + //{ + int capture_starting_pos = pos; + + // Match a Unicode digit greedily at least 1 and at most 3 times. 
+ //{ + charloop_starting_pos = pos; + + int iteration = 0; + while (iteration < 3 && (uint)iteration < (uint)slice.Length && char.IsDigit(slice[iteration])) + { + iteration++; + } + + if (iteration == 0) + { + goto NoMatch; + } + + slice = slice.Slice(iteration); + pos += iteration; + + charloop_ending_pos = pos; + charloop_starting_pos++; + goto CharLoopEnd; + + CharLoopBacktrack: + UncaptureUntil(base.runstack![--stackpos]); + StackPop2(base.runstack, ref stackpos, out charloop_ending_pos, out charloop_starting_pos); + + if (charloop_starting_pos >= charloop_ending_pos) + { + goto NoMatch; + } + pos = --charloop_ending_pos; + slice = inputSpan.Slice(pos, end - pos); + + CharLoopEnd: + StackPush3(ref base.runstack!, ref stackpos, charloop_starting_pos, charloop_ending_pos, base.Crawlpos()); + //} + + base.Capture(1, capture_starting_pos, pos); + + StackPush1(ref base.runstack!, ref stackpos, capture_starting_pos); + goto SkipBacktrack; + + CaptureBacktrack: + capture_starting_pos = base.runstack![--stackpos]; + goto CharLoopBacktrack; + + SkipBacktrack:; + //} + + // Zero-width positive lookahead assertion. + { + int positivelookahead_starting_pos = pos; + + // Loop greedily at least once. + //{ + loop_iteration = 0; + loop_starting_pos = pos; + + LoopBody: + StackPush3(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos, pos); + + loop_starting_pos = pos; + loop_iteration++; + + // Match a Unicode digit exactly 3 times. 
+ { + if ((uint)slice.Length < 3 || + !char.IsDigit(slice[0]) || + !char.IsDigit(slice[1]) || + !char.IsDigit(slice[2])) + { + goto LoopIterationNoMatch; + } + } + + pos += 3; + slice = slice.Slice(3); + if (pos != loop_starting_pos || loop_iteration == 0) + { + goto LoopBody; + } + goto LoopEnd; + + LoopIterationNoMatch: + loop_iteration--; + if (loop_iteration < 0) + { + goto CaptureBacktrack; + } + StackPop2(base.runstack, ref stackpos, out pos, out loop_starting_pos); + UncaptureUntil(base.runstack![--stackpos]); + slice = inputSpan.Slice(pos, end - pos); + if (loop_iteration == 0) + { + goto CaptureBacktrack; + } + if (loop_iteration == 0) + { + goto CaptureBacktrack; + } + LoopEnd:; + //} + + // Match if at a word boundary. + { + if (!base.IsBoundary(pos, base.runtextbeg, end)) + { + goto LoopIterationNoMatch; + } + } + + pos = positivelookahead_starting_pos; + slice = inputSpan.Slice(pos, end - pos); + } + + // The input matched. + base.runtextpos = pos; + base.Capture(0, original_pos, pos); + return; + + // The input didn't match. + NoMatch: + UncaptureUntil(0); + return; + + // Pop 2 values from the backtracking stack. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void StackPop2(int[] stack, ref int pos, out int arg0, out int arg1) + { + arg0 = stack[--pos]; + arg1 = stack[--pos]; + } + + // Push 1 value onto the backtracking stack. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void StackPush1(ref int[] stack, ref int pos, int arg0) + { + // If there's space available for the value, store it. + int[] s = stack; + int p = pos; + if ((uint)p < (uint)s.Length) + { + s[p] = arg0; + pos++; + return; + } + + // Otherwise, resize the stack to make room and try again. + WithResize(ref stack, ref pos, arg0); + + // Resize the backtracking stack array and push 1 value onto the stack. 
+ [MethodImpl(MethodImplOptions.NoInlining)] + static void WithResize(ref int[] stack, ref int pos, int arg0) + { + Array.Resize(ref stack, (pos + 0) * 2); + StackPush1(ref stack, ref pos, arg0); + } + } + + // Push 3 values onto the backtracking stack. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void StackPush3(ref int[] stack, ref int pos, int arg0, int arg1, int arg2) + { + // If there's space available for all 3 values, store them. + int[] s = stack; + int p = pos; + if ((uint)(p + 2) < (uint)s.Length) + { + s[p] = arg0; + s[p + 1] = arg1; + s[p + 2] = arg2; + pos += 3; + return; + } + + // Otherwise, resize the stack to make room and try again. + WithResize(ref stack, ref pos, arg0, arg1, arg2); + + // Resize the backtracking stack array and push 3 values onto the stack. + [MethodImpl(MethodImplOptions.NoInlining)] + static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2) + { + Array.Resize(ref stack, (pos + 2) * 2); + StackPush3(ref stack, ref pos, arg0, arg1, arg2); + } + } + + // Undo captures until we reach the specified capture position. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + void UncaptureUntil(int capturepos) + { + while (base.Crawlpos() > capturepos) + { + base.Uncapture(); + } + } + } + } + } + } +} diff --git a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs index f1666d9c1e9a4..fa4a8a686c40a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs @@ -14,16 +14,89 @@ namespace System.Text.RegularExpressions.Tests { public class PrecompiledRegexScenarioTest { + const string text = "asdf134success1245something"; + const string textWithMultipleMatches = @"asdf134success1245something +bsdf135success1245somethingelse +csdf136success2245somethingnew +dsdf137success3245somethingold"; + [Fact] - public void TestPrecompiledRegex() + public void PrecompiledRegex_MatchesTest() { - string text = "asdf134success1245something"; + string[] expectedMatches = textWithMultipleMatches.Split(Environment.NewLine); RegexTestClass testClass = new RegexTestClass(); + // Test Matches overloads Assert.Equal(1, testClass.Matches(text).Count); + Assert.Equal(0, testClass.Matches(text, startat: 7).Count); + MatchCollection multipleMatches = testClass.Matches(textWithMultipleMatches); + Assert.Equal(4, multipleMatches.Count); + for (int i = 0; i < expectedMatches.Length; i++) + { + Assert.Equal(expectedMatches[i], multipleMatches[i].Value.Trim()); // Calling Trim since the match will contain the new line as part of the match. 
+ } + } + + [Fact] + public void PrecompiledRegex_MatchTest() + { + RegexTestClass testClass = new RegexTestClass(); + Assert.Equal(1, testClass.Match(text).Groups[0].Captures.Count); + Assert.Equal(Match.Empty, testClass.Match(text, beginning: 7, length: text.Length - 7)); + Assert.Equal(5, testClass.Match(text, beginning: 5, length: text.Length - 5).Index); + Assert.False(testClass.Match("asdf134succes1245somethingasdf134success1245something", 0, 27).Success); // The first 27 characters shouldn't match. + Assert.True(testClass.Match("asdf134succes1245somethingasdf134success1245something", 26, 27).Success); // The last 27 characters should match. + Assert.Equal(Match.Empty, testClass.Match(text, startat: 7)); + Assert.Equal(6, testClass.Match(text, startat: 6).Index); + } + + [Fact] + public void PrecompiledRegex_ReplaceTest() + { + RegexTestClass testClass = new RegexTestClass(); + + Assert.Equal("4success", testClass.Replace(text, "$1${output}")); + Assert.Equal("4success", testClass.Replace(text, (match) => + { + return $"{match.Groups[1]}{match.Groups["output"]}"; + })); + Assert.Equal("4success\n5success\n6success\n7success", testClass.Replace(textWithMultipleMatches, "$1${output}")); + } + + [Fact] + public void PrecompiledRegex_SplitTest() + { + RegexTestClass testClass = new RegexTestClass(); + + Assert.Equal(new[] { "", "4", "success", "\n", "5", "success", "\n", "6", "success", "\n", "7", "success", "" }, testClass.Split(textWithMultipleMatches)); + Assert.Equal(new[] { "", "4", "success", $"\nbsdf135success1245somethingelse{Environment.NewLine}csdf136success2245somethingnew{Environment.NewLine}dsdf137success3245somethingold" }, testClass.Split(textWithMultipleMatches, 2)); + } + + [Fact] + public void PrecompiledRegex_CountTest() + { + RegexTestClass testClass = new RegexTestClass(); + + Assert.Equal(4, testClass.Count(textWithMultipleMatches)); + Assert.Equal(4, testClass.Count(textWithMultipleMatches)); + } + + [Fact] + public void 
PrecompiledRegex_ThrowsWhenSpanIsMatchIsCalled() + { + RegexTestClass testClass = new RegexTestClass(); + + Assert.Throws(() => testClass.IsMatch(text.AsSpan())); + } + + [Fact] + public void PrecompiledRegex_Groups() + { + RegexTestClass testClass = new RegexTestClass(); + Assert.Equal(text, testClass.Match(text).Groups[0].Value); - Assert.Equal(new int[] { 0, 1, 2}, testClass.GetGroupNumbers()); + Assert.Equal(new int[] { 0, 1, 2 }, testClass.GetGroupNumbers()); Assert.Equal(new string[] { "0", "1", "output" }, testClass.GetGroupNames()); } } @@ -39,8 +112,8 @@ public RegexTestClass() roptions = RegexOptions.IgnoreCase; internalMatchTimeout = TimeSpan.FromTicks(-10000L); factory = new RegexFactoryTestClass(); - Caps = new Hashtable {{0, 0}, {1, 1}, {2, 2}}; - CapNames = new Hashtable {{"0", 0}, {"1", 1}, {"output", 2}}; + Caps = new Hashtable { { 0, 0 }, { 1, 1 }, { 2, 2 } }; + CapNames = new Hashtable { { "0", 0 }, { "1", 1 }, { "output", 2 } }; capslist = new string[3]; capslist[0] = "0"; capslist[1] = "1"; @@ -139,7 +212,7 @@ protected override void Go() } } } - IL_441: + IL_441: while (true) { this.runtrackpos = num2; @@ -169,7 +242,7 @@ protected override void Go() } goto IL_49E; } - IL_4C7: + IL_4C7: this.CheckTimeout(); num = runtrack[num2++]; num4 = runtrack[num2++]; @@ -181,7 +254,7 @@ protected override void Go() continue; } continue; - IL_51D: + IL_51D: this.CheckTimeout(); num = runtrack[num2++]; num4 = runtrack[num2++]; @@ -191,7 +264,7 @@ protected override void Go() runtrack[--num2] = num - 1; runtrack[--num2] = 3; } - IL_204: + IL_204: this.CheckTimeout(); num4 = runstack[num3++]; this.Capture(1, num4, num); @@ -234,21 +307,21 @@ protected override void Go() runtrack[--num2] = num - 1; runtrack[--num2] = 5; } - IL_3FC: + IL_3FC: this.CheckTimeout(); num4 = runstack[num3++]; this.Capture(0, num4, num); runtrack[--num2] = num4; runtrack[num2 - 1] = 4; - IL_432: + IL_432: this.CheckTimeout(); this.runtextpos = num; return; - IL_49E: + IL_49E: 
this.CheckTimeout(); num = runtrack[num2++]; goto IL_432; - IL_598: + IL_598: this.CheckTimeout(); num = runtrack[num2++]; num4 = runtrack[num2++]; @@ -280,10 +353,10 @@ protected override bool FindFirstChar() while (num2 > 0); bool arg_74_0 = false; goto IL_6C; - IL_63: + IL_63: num--; arg_74_0 = true; - IL_6C: + IL_6C: this.runtextpos = num; return arg_74_0; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 3bb9d23d02901..e839f6fcf8131 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -840,7 +840,7 @@ public void Match(RegexEngine engine, string pattern, string input, RegexOptions if (isDefaultStart && isDefaultCount) { VerifyMatch(r.Match(input)); - Assert.Equal(expectedSuccess, r.IsMatch(input)); + VerifyIsMatch(r, input, expectedSuccess, Regex.InfiniteMatchTimeout); } if (beginning + length == input.Length && (options & RegexOptions.RightToLeft) == 0) { @@ -857,7 +857,7 @@ public void Match(RegexEngine engine, string pattern, string input, RegexOptions case RegexEngine.Compiled: case RegexEngine.NonBacktracking: VerifyMatch(Regex.Match(input, pattern, options | RegexHelpers.OptionsFromEngine(engine))); - Assert.Equal(expectedSuccess, Regex.IsMatch(input, pattern, options | RegexHelpers.OptionsFromEngine(engine))); + VerifyIsMatch(null, input, expectedSuccess, Regex.InfiniteMatchTimeout, pattern, options | RegexHelpers.OptionsFromEngine(engine)); break; } } @@ -1036,9 +1036,9 @@ public void Match_DefaultTimeout_Throws(RegexOptions options) public void Match_CachedPattern_NewTimeoutApplies(RegexOptions options) { const string PatternLeadingToLotsOfBacktracking = @"^(\w+\s?)*$"; - Assert.True(Regex.IsMatch("", PatternLeadingToLotsOfBacktracking, options, TimeSpan.FromDays(1))); + VerifyIsMatch(null, "", true, TimeSpan.FromDays(1), 
PatternLeadingToLotsOfBacktracking, options); var sw = Stopwatch.StartNew(); - Assert.Throws(() => Regex.IsMatch("An input string that takes a very very very very very very very very very very very long time!", PatternLeadingToLotsOfBacktracking, options, TimeSpan.FromMilliseconds(1))); + VerifyIsMatchThrows(null, "An input string that takes a very very very very very very very very very very very long time!", TimeSpan.FromMilliseconds(1), PatternLeadingToLotsOfBacktracking, options); Assert.InRange(sw.Elapsed.TotalSeconds, 0, 10); // arbitrary upper bound that should be well above what's needed with a 1ms timeout } @@ -1408,7 +1408,7 @@ public async Task Match_Advanced(RegexEngine engine, string pattern, string inpu VerifyMatch(r.Match(input)); VerifyMatch(Regex.Match(input, pattern, options)); - Assert.True(Regex.IsMatch(input, pattern, options)); + VerifyIsMatch(null, input, true, Regex.InfiniteMatchTimeout, pattern, options); } if (beginning + length == input.Length) @@ -1561,9 +1561,9 @@ public void Match_ExcessPrefix(RegexEngine engine) // Should not throw out of memory // Repeaters - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{2147483647,}")).IsMatch("a")); - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")).IsMatch("a")); - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50_000,}")).IsMatch("a")); // cutoff for Boyer-Moore prefix in release + VerifyIsMatch((await RegexHelpers.GetRegexAsync(engine, @"a{2147483647,}")), "a", false, Regex.InfiniteMatchTimeout); + VerifyIsMatch((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")), "a", false, Regex.InfiniteMatchTimeout); + VerifyIsMatch((await RegexHelpers.GetRegexAsync(engine, @"a{50_000,}")), "a", false, Regex.InfiniteMatchTimeout); // cutoff for Boyer-Moore prefix in release // Multis foreach (int length in new[] { 50, 50_000, char.MaxValue + 1 }) @@ -1575,7 +1575,7 @@ public void Match_ExcessPrefix(RegexEngine engine) if (!RegexHelpers.IsNonBacktracking(engine) || 
length < 50_000) { string s = "bcd" + new string('a', length) + "efg"; - Assert.True((await RegexHelpers.GetRegexAsync(engine, @$"a{{{length}}}")).IsMatch(s)); + VerifyIsMatch((await RegexHelpers.GetRegexAsync(engine, @$"a{{{length}}}")), s, true, Regex.InfiniteMatchTimeout); } } }, engine.ToString()).Dispose(); @@ -1625,9 +1625,9 @@ public void IsMatch_Invalid() AssertExtensions.Throws("input", () => r.IsMatch(null, 0)); // Pattern is null - AssertExtensions.Throws("pattern", () => Regex.IsMatch("input", null)); - AssertExtensions.Throws("pattern", () => Regex.IsMatch("input", null, RegexOptions.None)); - AssertExtensions.Throws("pattern", () => Regex.IsMatch("input", null, RegexOptions.None, TimeSpan.FromSeconds(1))); + VerifyIsMatchThrows(null, "input", Regex.InfiniteMatchTimeout, pattern: null); + VerifyIsMatchThrows(null, "input", Regex.InfiniteMatchTimeout, pattern: null, RegexOptions.None); + VerifyIsMatchThrows(null, "input", TimeSpan.FromSeconds(1), pattern: null, RegexOptions.None); // Start is invalid Assert.Throws(() => r.IsMatch("input", -1)); @@ -1650,7 +1650,7 @@ public static IEnumerable IsMatch_SucceedQuicklyDueToLoopReduction_Mem public async Task IsMatch_SucceedQuicklyDueToLoopReduction(RegexEngine engine, string pattern, string input, bool expected) { Regex r = await RegexHelpers.GetRegexAsync(engine, pattern); - Assert.Equal(expected, r.IsMatch(input)); + VerifyIsMatch(r, input, expected, Regex.InfiniteMatchTimeout); } [Theory] @@ -1660,6 +1660,10 @@ public async Task TestCharIsLowerCultureEdgeCasesAroundTurkishCharacters(RegexEn Regex r1 = await RegexHelpers.GetRegexAsync(engine, "[\u012F-\u0130]", RegexOptions.IgnoreCase); Regex r2 = await RegexHelpers.GetRegexAsync(engine, "[\u012F\u0130]", RegexOptions.IgnoreCase); Assert.Equal(r1.IsMatch("\u0130"), r2.IsMatch("\u0130")); +#if NET7_0_OR_GREATER + Assert.Equal(r1.IsMatch("\u0130".AsSpan()), r2.IsMatch("\u0130".AsSpan())); +#endif + } [Fact] @@ -1688,8 +1692,8 @@ public void Synchronized() 
public async Task Match_Boundary(RegexEngine engine) { Regex r = await RegexHelpers.GetRegexAsync(engine, @"\b\w+\b"); - Assert.False(r.IsMatch(" AB\u200cCD ")); - Assert.False(r.IsMatch(" AB\u200dCD ")); + VerifyIsMatch(r, " AB\u200cCD ", false, Regex.InfiniteMatchTimeout); + VerifyIsMatch(r, " AB\u200dCD ", false, Regex.InfiniteMatchTimeout); } public static IEnumerable Match_Count_TestData() @@ -2002,11 +2006,56 @@ public async Task StandardCharSets_SameMeaningAcrossAllEngines(string singleChar bool baseline = regexes[0].IsMatch(s); for (int i = 1; i < regexes.Count; i++) { - Assert.Equal(baseline, regexes[i].IsMatch(s)); + VerifyIsMatch(regexes[i], s, baseline, Regex.InfiniteMatchTimeout); } } } + private static void VerifyIsMatchThrows(Regex? r, string input, TimeSpan timeout, string? pattern = null, RegexOptions options = RegexOptions.None) + where T : Exception + { + if (r == null) + { + Assert.Throws(() => timeout == Regex.InfiniteMatchTimeout ? Regex.IsMatch(input, pattern, options) : Regex.IsMatch(input, pattern, options, timeout)); +#if NET7_0_OR_GREATER + Assert.Throws(() => timeout == Regex.InfiniteMatchTimeout ? Regex.IsMatch(input.AsSpan(), pattern, options) : Regex.IsMatch(input.AsSpan(), pattern, options, timeout)); +#endif + } + else + { + Assert.Throws(() => r.IsMatch(input)); +#if NET7_0_OR_GREATER + Assert.Throws(() => r.IsMatch(input.AsSpan())); +#endif + } + } + + private static void VerifyIsMatch(Regex? r, string input, bool expected, TimeSpan timeout, string? pattern = null, RegexOptions options = RegexOptions.None) + { + if (r == null) + { + Assert.Equal(expected, timeout == Regex.InfiniteMatchTimeout ? Regex.IsMatch(input, pattern, options) : Regex.IsMatch(input, pattern, options, timeout)); + if (options == RegexOptions.None) + { + Assert.Equal(expected, Regex.IsMatch(input, pattern)); + } +#if NET7_0_OR_GREATER + Assert.Equal(expected, timeout == Regex.InfiniteMatchTimeout ? 
Regex.IsMatch(input.AsSpan(), pattern, options) : Regex.IsMatch(input.AsSpan(), pattern, options, timeout)); + if (options == RegexOptions.None) + { + Assert.Equal(expected, Regex.IsMatch(input.AsSpan(), pattern)); + } +#endif + } + else + { + Assert.Equal(expected, r.IsMatch(input)); +#if NET7_0_OR_GREATER + Assert.Equal(expected, r.IsMatch(input.AsSpan())); +#endif + } + } + public static IEnumerable Match_DisjunctionOverCounting_TestData() { foreach (RegexEngine engine in RegexHelpers.AvailableEngines) diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs new file mode 100644 index 0000000000000..82f2ae5a0336b --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs @@ -0,0 +1,58 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Reflection; +using System.Threading.Tasks; +using Xunit; + +namespace System.Text.RegularExpressions.Tests +{ + public class RegexRunnerTests + { + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task EnginesThrowNotImplementedForGoAndFFC(RegexEngine engine) + { + Regex re = await RegexHelpers.GetRegexAsync(engine, /*lang=regex*/@"abc"); + + // Use reflection to ensure the runner is created so it can be fetched. 
+ MethodInfo createRunnerMethod = typeof(Regex).GetMethod("CreateRunner", BindingFlags.Instance | BindingFlags.NonPublic); + RegexRunner runner = createRunnerMethod.Invoke(re, new object[] { }) as RegexRunner; + + // Use reflection to call Go and FFC and ensure it throws NotImplementedException + MethodInfo goMethod = typeof(RegexRunner).GetMethod("Go", BindingFlags.Instance | BindingFlags.NonPublic); + MethodInfo ffcMethod = typeof(RegexRunner).GetMethod("FindFirstChar", BindingFlags.Instance | BindingFlags.NonPublic); + + // FindFirstChar and Go methods should not be implemented since built-in engines should be overriding and using Scan instead. + TargetInvocationException goInvocationException = Assert.Throws(() => goMethod.Invoke(runner, new object[] { })); + Assert.Equal(typeof(NotImplementedException), goInvocationException.InnerException.GetType()); + TargetInvocationException ffcInvocationException = Assert.Throws(() => ffcMethod.Invoke(runner, new object[] { })); + Assert.Equal(typeof(NotImplementedException), ffcInvocationException.InnerException.GetType()); + } + + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task EnsureRunmatchValueIsNulledAfterIsMatch(RegexEngine engine) + { + Regex re = await RegexHelpers.GetRegexAsync(engine, /*lang=regex*/@"abc"); + + // First call IsMatch which should initialize runmatch on the runner. + Assert.True(re.IsMatch("abcabcabc")); + + // Ensure runmatch wasn't nulled out, since after calling IsMatch it should be reused. 
+ FieldInfo runnerField = typeof(Regex).GetField("_runner", BindingFlags.Instance | BindingFlags.NonPublic); + RegexRunner runner = runnerField.GetValue(re) as RegexRunner; + FieldInfo runmatchField = typeof(RegexRunner).GetField("runmatch", BindingFlags.Instance | BindingFlags.NonPublic); + Match runmatch = runmatchField.GetValue(runner) as Match; + Assert.NotNull(runmatch); + + // Ensure that the Value of runmatch was nulled out, so as to not keep a reference to it in a cache. + MethodInfo getTextMethod = typeof(Match).GetMethod("get_Text", BindingFlags.Instance | BindingFlags.NonPublic); + Assert.Null(getTextMethod.Invoke(runmatch, new object[] { })); + Assert.Equal(string.Empty, runmatch.Value); +#if NET7_0_OR_GREATER + Assert.True(runmatch.ValueSpan == ReadOnlySpan.Empty); +#endif + } + } +} diff --git a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj index d95fa07ad73f1..951257ab4b5b6 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj @@ -41,6 +41,8 @@ + +