From 27b0c721f1a078c15d35fe7497ec47a04ea8c15f Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Wed, 16 Feb 2022 16:08:29 -0800 Subject: [PATCH 01/17] Adding Regex.IsMatch(Span) and RegexRunner.Scan() methods --- .../gen/RegexGenerator.Emitter.cs | 61 +++- .../ref/System.Text.RegularExpressions.cs | 13 +- .../System/Text/RegularExpressions/Capture.cs | 23 +- .../RegularExpressions/CompiledRegexRunner.cs | 18 +- .../CompiledRegexRunnerFactory.cs | 17 +- .../System/Text/RegularExpressions/Group.cs | 2 +- .../System/Text/RegularExpressions/Match.cs | 7 +- .../Text/RegularExpressions/Regex.Match.cs | 38 +++ .../System/Text/RegularExpressions/Regex.cs | 198 ++++++++++++- .../Text/RegularExpressions/RegexCompiler.cs | 109 +++++++- .../RegularExpressions/RegexInterpreter.cs | 56 +++- .../RegularExpressions/RegexLWCGCompiler.cs | 15 +- .../Text/RegularExpressions/RegexRunner.cs | 262 ++++++++---------- .../Symbolic/SymbolicRegexRunnerFactory.cs | 15 +- 14 files changed, 600 insertions(+), 234 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index d0a0684806d2f..fe848a0b46f79 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -215,14 +215,19 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" {{"); // Main implementation methods - writer.WriteLine($" protected override void InitTrackCount() => base.runtrackcount = {rm.Code.TrackCount};"); - writer.WriteLine(); - writer.WriteLine(" // Description:"); DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", rm.Code); // skip implicit root capture writer.WriteLine(); - writer.WriteLine($" protected override bool FindFirstChar()"); + writer.WriteLine($" protected override void Scan(global::System.ReadOnlySpan text)"); + 
writer.WriteLine($" {{"); + writer.Indent += 4; + EmitScan(writer, rm, id); + writer.Indent -= 4; + writer.WriteLine($" }}"); + writer.WriteLine(); + + writer.WriteLine($" private bool FindFirstChar(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id); @@ -233,7 +238,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri { writer.WriteLine($" [global::System.Runtime.CompilerServices.SkipLocalsInit]"); } - writer.WriteLine($" protected override void Go()"); + writer.WriteLine($" private bool Go(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; requiredHelpers |= EmitGo(writer, rm, id); @@ -299,6 +304,38 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) } } + private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) + { + using (EmitBlock(writer, "while (true)")) + { + using (EmitBlock(writer, "if (FindFirstChar(text))")) + { + if (rm.MatchTimeout != Timeout.Infinite) + { + writer.WriteLine("base.CheckTimeout();"); + writer.WriteLine(); + } + + writer.WriteLine("// If we got a match, we're done."); + using (EmitBlock(writer, "if (Go(text))")) + { + writer.WriteLine("return;"); + } + writer.WriteLine(); + } + writer.WriteLine(); + + writer.WriteLine("// We failed to find a match. If we're at the end of the input, then we are done."); + using (EmitBlock(writer, "if (base.runtextpos == text.Length)")) + { + writer.WriteLine("return;"); + } + writer.WriteLine(); + + writer.WriteLine("base.runtextpos++;"); + } + } + /// Emits the body of the FindFirstChar override. 
private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id) { @@ -347,7 +384,6 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix); break; @@ -356,13 +392,11 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitFixedSet(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive: Debug.Assert(code.FindOptimizations.LiteralAfterLoop is not null); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitLiteralAfterAtomicLoop(); break; @@ -463,7 +497,6 @@ bool EmitAnchors() // the other anchors, which all skip all subsequent processing if found, with BOL we just use it // to boost our position to the next line, and then continue normally with any searches. writer.WriteLine("// Beginning-of-line anchor"); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); additionalDeclarations.Add("int beginning = base.runtextbeg;"); using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')")) { @@ -763,6 +796,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? 
node.Str!.Length : 1)};"); writer.WriteLine("base.Capture(0, start, end);"); writer.WriteLine("base.runtextpos = end;"); + writer.WriteLine("return true;"); return requiredHelpers; case RegexNodeKind.Empty: @@ -770,6 +804,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // source generator and seeing what happens as you add more to expressions. When approaching // it from a learning perspective, this is very common, as it's the empty string you start with. writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);"); + writer.WriteLine("return true;"); return requiredHelpers; } @@ -781,7 +816,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // Declare some locals. string sliceSpan = "slice"; - writer.WriteLine("global::System.ReadOnlySpan inputSpan = base.runtext;"); writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;"); writer.WriteLine($"int original_pos = pos;"); bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); @@ -826,7 +860,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe } writer.WriteLine("base.runtextpos = pos;"); writer.WriteLine("base.Capture(0, original_pos, pos);"); - writer.WriteLine("return;"); + writer.WriteLine("return true;"); writer.WriteLine(); // We only get here in the code if the whole expression fails to match and jumps to @@ -837,6 +871,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe { EmitUncaptureUntil("0"); } + writer.WriteLine("return false;"); // We're done with the match. @@ -846,8 +881,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // And emit any required helpers. 
if (additionalLocalFunctions.Count != 0) { - writer.WriteLine("return;"); // not strictly necessary, just for readability - foreach (KeyValuePair localFunctions in additionalLocalFunctions.OrderBy(k => k.Key)) { writer.WriteLine(); @@ -2150,7 +2183,7 @@ void EmitBoundary(RegexNode node) _ => "base.IsECMABoundary", }; - using (EmitBlock(writer, $"if ({call}(pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))")) + using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}))")) { writer.WriteLine($"goto {doneLabel};"); } diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 251d595081072..3abad4033aad4 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -171,6 +171,10 @@ public static void CompileToAssembly(System.Text.RegularExpressions.RegexCompila public string GroupNameFromNumber(int i) { throw null; } public int GroupNumberFromName(string name) { throw null; } protected void InitializeReferences() { } + public bool IsMatch(System.ReadOnlySpan input) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, 
System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; } public bool IsMatch(string input) { throw null; } public bool IsMatch(string input, int startat) { throw null; } public static bool IsMatch(string input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; } @@ -330,17 +334,20 @@ protected void DoubleCrawl() { } protected void DoubleStack() { } protected void DoubleTrack() { } protected void EnsureStorage() { } - protected abstract bool FindFirstChar(); - protected abstract void Go(); - protected abstract void InitTrackCount(); + protected virtual bool FindFirstChar() { throw null; } + protected virtual void Go() { throw null; } + protected virtual void InitTrackCount() { throw null; } protected bool IsBoundary(int index, int startpos, int endpos) { throw null; } + protected bool IsBoundary(System.ReadOnlySpan inputSpan, int index) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; } + protected bool IsECMABoundary(System.ReadOnlySpan inputSpan, int index) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines protected bool IsMatched(int cap) { throw null; } protected int MatchIndex(int cap) { throw null; } protected int MatchLength(int cap) { throw null; } protected int Popcrawl() { throw null; } protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) { throw null; } protected internal System.Text.RegularExpressions.Match? 
Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; } + protected internal virtual void Scan(System.ReadOnlySpan text) { throw null; } protected void TransferCapture(int capnum, int uncapnum, int start, int end) { } protected void Uncapture() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs index 81683e09bec1f..2eee81fadc66f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs @@ -9,7 +9,7 @@ namespace System.Text.RegularExpressions /// public class Capture { - internal Capture(string text, int index, int length) + internal Capture(string? text, int index, int length) { Text = text; Index = index; @@ -19,27 +19,38 @@ internal Capture(string text, int index, int length) /// Returns the position in the original string where the first character of captured substring was found. public int Index { get; private protected set; } + /// + /// This method should only be called when the text for matching was sliced with a different beginning, so the resulting index of + /// the match is not from the start of the text, but instead the start of the slice. This method adds that offset back + /// to account for the original text beginning. + /// + /// The original text's beginning offset. + internal void AddBeginningToIndex(int beginning) + { + Index += beginning; + } + /// Returns the length of the captured substring. public int Length { get; private protected set; } /// The original string - internal string Text { get; set; } + internal string? Text { get; set; } /// Gets the captured substring from the input string. 
/// The substring that is captured by the match. - public string Value => Text.Substring(Index, Length); + public string Value => Text is string text ? text.Substring(Index, Length) : string.Empty; /// Gets the captured span from the input string. /// The span that is captured by the match. - public ReadOnlySpan ValueSpan => Text.AsSpan(Index, Length); + public ReadOnlySpan ValueSpan => Text is string text ? text.AsSpan(Index, Length) : ReadOnlySpan.Empty; /// Returns the substring that was matched. public override string ToString() => Value; /// The substring to the left of the capture - internal ReadOnlyMemory GetLeftSubstring() => Text.AsMemory(0, Index); + internal ReadOnlyMemory GetLeftSubstring() => Text is string text ? text.AsMemory(0, Index) : ReadOnlyMemory.Empty; /// The substring to the right of the capture - internal ReadOnlyMemory GetRightSubstring() => Text.AsMemory(Index + Length, Text.Length - Index - Length); + internal ReadOnlyMemory GetRightSubstring() => Text is string text ? 
text.AsMemory(Index + Length, Text.Length - Index - Length) : ReadOnlyMemory.Empty; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index c9e6419fed87c..459ca6ed590a7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -5,20 +5,16 @@ namespace System.Text.RegularExpressions { internal sealed class CompiledRegexRunner : RegexRunner { - private readonly Action _goMethod; - private readonly Func _findFirstCharMethod; + private readonly ScanDelegate _scanMethod; - public CompiledRegexRunner(Action go, Func findFirstChar, int trackCount) + internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan text); + + public CompiledRegexRunner(ScanDelegate scan) { - _goMethod = go; - _findFirstCharMethod = findFirstChar; - runtrackcount = trackCount; + _scanMethod = scan; } - protected override void Go() => _goMethod(this); - - protected override bool FindFirstChar() => _findFirstCharMethod(this); - - protected override void InitTrackCount() { } + protected internal override void Scan(ReadOnlySpan text) + => _scanMethod(this, text); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs index 958d5cf3dc16f..ab5a5ed4f913e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs @@ -7,25 +7,18 @@ namespace System.Text.RegularExpressions { internal sealed class 
CompiledRegexRunnerFactory : RegexRunnerFactory { - private readonly DynamicMethod _goMethod; - private readonly DynamicMethod _findFirstCharMethod; - private readonly int _trackcount; + private readonly DynamicMethod _scanMethod; // Delegates are lazily created to avoid forcing JIT'ing until the regex is actually executed. - private Action? _go; - private Func? _findFirstChar; + private CompiledRegexRunner.ScanDelegate? _scan; - public CompiledRegexRunnerFactory(DynamicMethod goMethod, DynamicMethod findFirstCharMethod, int trackcount) + public CompiledRegexRunnerFactory(DynamicMethod scanMethod) { - _goMethod = goMethod; - _findFirstCharMethod = findFirstCharMethod; - _trackcount = trackcount; + _scanMethod = scanMethod; } protected internal override RegexRunner CreateInstance() => new CompiledRegexRunner( - _go ??= _goMethod.CreateDelegate>(), - _findFirstChar ??= _findFirstCharMethod.CreateDelegate>(), - _trackcount); + _scan ??= _scanMethod.CreateDelegate()); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs index f4b2a7fb2e980..2c34694f1ecaf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs @@ -16,7 +16,7 @@ public class Group : Capture internal int _capcount; internal CaptureCollection? _capcoll; - internal Group(string text, int[] caps, int capcount, string name) + internal Group(string? text, int[] caps, int capcount, string name) : base(text, capcount == 0 ? 0 : caps[(capcount - 1) * 2], capcount == 0 ? 
0 : caps[(capcount * 2) - 1]) { _caps = caps; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 3c67526b40e18..19859fd2f0b2d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -50,7 +50,7 @@ public class Match : Group internal bool _balancing; // whether we've done any balancing with this match. If we // have done balancing, we'll need to do extra work in Tidy(). - internal Match(Regex? regex, int capcount, string text, int begpos, int len, int startpos) : + internal Match(Regex? regex, int capcount, string? text, int begpos, int len, int startpos) : base(text, new int[2], 0, "0") { _regex = regex; @@ -66,7 +66,7 @@ internal Match(Regex? regex, int capcount, string text, int begpos, int len, int /// Returns an empty Match object. public static Match Empty { get; } = new Match(null, 1, string.Empty, 0, 0, 0); - internal void Reset(Regex regex, string text, int textbeg, int textend, int textstart) + internal void Reset(Regex regex, string? text, int textbeg, int textend, int textstart) { _regex = regex; Text = text; @@ -94,6 +94,7 @@ internal void Reset(Regex regex, string text, int textbeg, int textend, int text public Match NextMatch() { Regex? r = _regex; + Debug.Assert(Text != null); return r != null ? r.Run(false, Length, Text, _textbeg, _textend - _textbeg, _textpos)! : this; @@ -338,7 +339,7 @@ internal sealed class MatchSparse : Match { private new readonly Hashtable _caps; - internal MatchSparse(Regex regex, Hashtable caps, int capcount, string text, int begpos, int len, int startpos) : + internal MatchSparse(Regex regex, Hashtable caps, int capcount, string? 
text, int begpos, int len, int startpos) : base(regex, capcount, text, begpos, len, startpos) { _caps = caps; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index cd0494c3dc009..d50e4dc396c5a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -13,6 +13,15 @@ public partial class Regex public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => RegexCache.GetOrAdd(pattern).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// if the input matches the pattern, otherwise. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => + RegexCache.GetOrAdd(pattern).IsMatch(input); + /// /// Searches the input string for one or more occurrences of the text /// supplied in the pattern parameter with matching options supplied in the options @@ -21,9 +30,30 @@ public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Reg public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern. It uses the passed in options. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// The options to be used for matching + /// if the input matches the pattern, otherwise. 
+ public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => + RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); + public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern under the specified timeout. It uses the passed in options. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// The options to be used for matching + /// Max time to be used for matching before returning. + /// if the input matches the pattern, otherwise. Also returns for time out. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => + RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); + /// /// Searches the input string for one or more matches using the previous pattern, /// options, and starting position. @@ -38,6 +68,14 @@ public bool IsMatch(string input) return Run(quick: true, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0) is null; } + /// + /// Searches the input span for one or more matches using the previous pattern, + /// options, and starting position. + /// + /// if the input matches the pattern, otherwise. + public bool IsMatch(ReadOnlySpan input) => + Run(input, 0, input.Length, UseOptionR() ? input.Length : 0) is null; + /// /// Searches the input string for one or more matches using the previous pattern and options, /// with a new starting position. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index c23f309c9f34d..d62d1cf16480d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -372,7 +372,113 @@ protected void InitializeReferences() RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - return runner.Scan(this, input, beginning, beginning + length, startat, prevlen, quick, internalMatchTimeout); + runner.InitializeTimeout(internalMatchTimeout); + runner.runtext = input; + ReadOnlySpan span = input.AsSpan(beginning, length); + runner.InitializeForScan(this, span, startat - beginning, quick); + + int stoppos = RightToLeft ? 0 : span.Length; + + // If previous match was empty or failed, advance by one before matching. + if (prevlen == 0) + { + if (runner.runtextstart == stoppos) + { + return RegularExpressions.Match.Empty; + } + + runner.runtextpos += RightToLeft ? -1 : 1; + } + + runner.Scan(span); + + Match? match = runner.runmatch; + // if we got a match, set runmatch to null if quick is true + if (match!._matchcount[0] > 0) + { + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache + + if (match.Text != input) + { + match.Text = input; + } + + if (quick) + { + runner.runmatch!.Text = null; // Drop reference to text + return null; + } + + runner.runmatch = null; + match.Tidy(runner.runtextpos); + + // If there was a match and the original text was sliced, then add beginning to the index to get the real + // Index of the match. 
+ if (match.Success && beginning != 0) + { + match.AddBeginningToIndex(beginning); + } + + return match; + } + + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache + + if (!quick) + { + runner.runmatch = null; + } + else + { + + if (runner.runmatch != null) + { + runner.runmatch.Text = null; + } + } + + return RegularExpressions.Match.Empty; + } + finally + { + _runner = runner; + } + } + + internal Match? Run(ReadOnlySpan input, int beginning, int length, int startat) + { + if ((uint)startat > (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.startat, ExceptionResource.BeginIndexNotNegative); + } + if ((uint)length > (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.length, ExceptionResource.LengthNotNegative); + } + + RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); + try + { + runner.InitializeTimeout(internalMatchTimeout); + ReadOnlySpan span = input.Slice(beginning, length); + runner.InitializeForScan(this, span, startat - beginning, quick: true); + + runner.Scan(span); + + Match? match = runner.runmatch; + // if we got a match, set runmatch to null if quick is true + if (match!._matchcount[0] > 0) + { + runner.runmatch!.Text = null; // Drop reference to text + return null; + } + + if (runner.runmatch != null) + { + runner.runmatch.Text = null; + } + + return RegularExpressions.Match.Empty; } finally { @@ -387,7 +493,95 @@ internal void Run(string input, int startat, ref TState state, MatchCall RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - runner.ScanInternal(this, input, startat, ref state, callback, reuseMatchObject, internalMatchTimeout); + runner.InitializeTimeout(internalMatchTimeout); + int runtextpos = startat; + while (true) + { + runner.InitializeForScan(this, input, startat, false); + runner.runtextpos = runtextpos; + + int stoppos = RightToLeft ? 
0 : input.Length; + runner.Scan(input); + + Match? match = runner.runmatch; + + // If we got a match, hand it to the callback (dropping the runner's reference first unless the match object is reused). + if (match!._matchcount[0] > 0) + { + if (match.Text != input) + { + match.Text = input; + } + + if (!reuseMatchObject) + { + // We're not reusing match objects, so null out our field reference to the instance. + // It'll be recreated the next time one is needed. + runner.runmatch = null; + } + + match.Tidy(runner.runtextpos); + if (!callback(ref state, match)) + { + if (reuseMatchObject) + { + // We're reusing the single match instance, so clear out its text as well. + // We don't do this if we're not reusing instances, as in that case we're + // dropping the whole reference to the match, and we no longer own the instance + // having handed it out to the callback. + match.Text = null!; + } + return; + } + + // Now that we've matched successfully, update the starting position to reflect + // the current position, just as Match.NextMatch() would pass in _textpos as textstart. + runtextpos = startat = runner.runtextpos; + + + // Reset state for another iteration. + runner.runtrackpos = runner.runtrack!.Length; + runner.runstackpos = runner.runstack!.Length; + runner.runcrawlpos = runner.runcrawl!.Length; + + if (match.Length == 0) + { + if (runner.runtextpos == stoppos) + { + if (reuseMatchObject) + { + // See above comment. + match.Text = null!; + } + return; + } + + runtextpos += RightToLeft ? -1 : 1; + } + + // Loop around to perform next match from where we left off. + continue; + } + else + { + // We failed to match at this position. If we're at the stopping point, we're done. 
+ if (runner.runtextpos == stoppos) + { + if (!reuseMatchObject) + { + runner.runmatch = null; + } + else + { + if (runner.runmatch != null) + { + runner.runmatch.Text = null!; + } + } + return; + } + } + } } finally { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index fb93630ff9aef..dc3a3c32e085e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Reflection; using System.Reflection.Emit; @@ -20,7 +21,6 @@ internal abstract class RegexCompiler private static readonly FieldInfo s_runtextendField = RegexRunnerField("runtextend"); private static readonly FieldInfo s_runtextstartField = RegexRunnerField("runtextstart"); private static readonly FieldInfo s_runtextposField = RegexRunnerField("runtextpos"); - private static readonly FieldInfo s_runtextField = RegexRunnerField("runtext"); private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack"); private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture"); @@ -29,9 +29,9 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_isMatchedMethod = RegexRunnerMethod("IsMatched"); private static readonly MethodInfo s_matchLengthMethod = RegexRunnerMethod("MatchLength"); private static readonly MethodInfo s_matchIndexMethod = RegexRunnerMethod("MatchIndex"); - private static readonly MethodInfo s_isBoundaryMethod = RegexRunnerMethod("IsBoundary"); + private static readonly MethodInfo s_isBoundaryMethod = typeof(RegexRunner).GetMethod("IsBoundary", BindingFlags.NonPublic | 
BindingFlags.Instance, new[] { typeof(ReadOnlySpan), typeof(int) })!; private static readonly MethodInfo s_isWordCharMethod = RegexRunnerMethod("IsWordChar"); - private static readonly MethodInfo s_isECMABoundaryMethod = RegexRunnerMethod("IsECMABoundary"); + private static readonly MethodInfo s_isECMABoundaryMethod = typeof(RegexRunner).GetMethod("IsECMABoundary", BindingFlags.NonPublic | BindingFlags.Instance, new[] { typeof(ReadOnlySpan), typeof(int) })!; private static readonly MethodInfo s_crawlposMethod = RegexRunnerMethod("Crawlpos"); private static readonly MethodInfo s_charInClassMethod = RegexRunnerMethod("CharInClass"); private static readonly MethodInfo s_checkTimeoutMethod = RegexRunnerMethod("CheckTimeout"); @@ -180,6 +180,9 @@ internal abstract class RegexCompiler /// A macro for _ilg.Emit(OpCodes.Ldarg_0). protected void Ldthis() => _ilg!.Emit(OpCodes.Ldarg_0); + /// A macro for _ilg.Emit(OpCodes.Ldarg_1). + private void Ldarg_1() => _ilg!.Emit(OpCodes.Ldarg_1); + /// A macro for Ldthis(); Ldfld(); protected void Ldthisfld(FieldInfo ft) { @@ -271,6 +274,9 @@ private void Mvfldloc(FieldInfo ft, LocalBuilder lt) private void Switch(Label[] table) => _ilg!.Emit(OpCodes.Switch, table); + /// Declares a local bool. + private LocalBuilder DeclareBool() => _ilg!.DeclareLocal(typeof(bool)); + /// Declares a local int. private LocalBuilder DeclareInt32() => _ilg!.DeclareLocal(typeof(int)); @@ -388,11 +394,10 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or // Load necessary locals // int pos = base.runtextpos; // int end = base.runtextend; - // ReadOnlySpan inputSpan = base.runtext.AsSpan(); + // ReadOnlySpan inputSpan = input; Mvfldloc(s_runtextposField, pos); Mvfldloc(s_runtextendField, end); - Ldthisfld(s_runtextField); - Call(s_stringAsSpanMethod); + Ldarg_1(); Stloc(inputSpan); // Generate length check. If the input isn't long enough to possibly match, fail quickly. 
@@ -1061,7 +1066,7 @@ protected void EmitGo() // to have been validated in FindFirstChar when doing case-sensitive comparison. // base.Capture(0, base.runtextpos, base.runtextpos + node.Str.Length); // base.runtextpos = base.runtextpos + node.Str.Length; - // return; + // return true; Ldthis(); Dup(); Ldc(0); @@ -1074,6 +1079,7 @@ protected void EmitGo() Ldc(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1); Add(); Stfld(s_runtextposField); + Ldc(1); Ret(); return; @@ -1098,10 +1104,9 @@ protected void EmitGo() // CultureInfo culture = CultureInfo.CurrentCulture; // only if the whole expression or any subportion is ignoring case, and we're not using invariant InitializeCultureForGoIfNecessary(); - // ReadOnlySpan inputSpan = base.runtext.AsSpan(); + // ReadOnlySpan inputSpan = input; // int end = base.runtextend; - Ldthisfld(s_runtextField); - Call(s_stringAsSpanMethod); + Ldarg_1(); Stloc(inputSpan); Mvfldloc(s_runtextendField, end); @@ -1150,6 +1155,9 @@ protected void EmitGo() Ldloc(originalPos); Ldloc(pos); Call(s_captureMethod); + // return true; + Ldc(1); + Ret(); // If the graph contained captures, undo any remaining to handle failed matches. if (expressionHasCaptures) @@ -1180,7 +1188,8 @@ protected void EmitGo() MarkLabel(originalDoneLabel); } - // return; + // return false; + Ldc(0); Ret(); // Generated code successfully. 
@@ -2313,16 +2322,15 @@ void EmitBoundary(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary, $"Unexpected type: {node.Kind}"); - // if (!IsBoundary(pos + sliceStaticPos, base.runtextbeg, end)) goto doneLabel; + // if (!IsBoundary(inputSpan, pos + sliceStaticPos)) goto doneLabel; Ldthis(); + Ldloc(inputSpan); Ldloc(pos); if (sliceStaticPos > 0) { Ldc(sliceStaticPos); Add(); } - Ldthisfld(s_runtextbegField); - Ldloc(end); switch (node.Kind) { case RegexNodeKind.Boundary: @@ -3955,6 +3963,79 @@ void EmitStackPop() } } + protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMethod) + { + LocalBuilder bump = DeclareInt32(); + LocalBuilder stoppos = DeclareInt32(); + Label returnLabel = DefineLabel(); + + // int bump = 1 + Ldc(1); + Stloc(bump); + + // int stoppos = text.Length + _ilg!.Emit(OpCodes.Ldarga_S, 1); + Call(s_spanGetLengthMethod); + Stloc(stoppos); + + // while (true) + Label whileLoopEnd = DefineLabel(); + Label whileLoopBody = DefineLabel(); + MarkLabel(whileLoopBody); + + // if (FindFirstChar(text)) + Label afterFindFirstCharLabel = DefineLabel(); + Ldthis(); + Ldarg_1(); + Call(findFirstCharMethod); + BrfalseFar(afterFindFirstCharLabel); + + if (_hasTimeout) + { + // CheckTimeout(); + Ldthis(); + Call(s_checkTimeoutMethod); + } + + // if (Go(text)) + // return; + Label afterSuccessMatchLabel = DefineLabel(); + Ldthis(); + Ldarg_1(); + Call(goMethod); + BrfalseFar(afterSuccessMatchLabel); + BrFar(returnLabel); + MarkLabel(afterSuccessMatchLabel); + + // if (runtextpos == stoppos) + Label incrementRuntextPosLabel = DefineLabel(); + MarkLabel(afterFindFirstCharLabel); + Ldthisfld(s_runtextposField); + Ldloc(stoppos); + Ceq(); + BrfalseFar(incrementRuntextPosLabel); + + // return; + BrFar(returnLabel); + + // runtextpos += bump + MarkLabel(incrementRuntextPosLabel); + Ldthis(); + Ldthisfld(s_runtextposField); + Ldloc(bump); + 
Add(); + Stfld(s_runtextposField); + + // End loop body. + BrFar(whileLoopBody); + MarkLabel(whileLoopEnd); + + // return; + MarkLabel(returnLabel); + _ilg!.Emit(OpCodes.Nop); + Ret(); + } + private void InitializeCultureForGoIfNecessary() { _textInfo = null; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 9f615754a808f..8c96d6e624825 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -324,15 +324,53 @@ private bool MatchRef(int index, int length, ReadOnlySpan inputSpan) private void Backwardnext() => runtextpos += _rightToLeft ? 1 : -1; - protected override bool FindFirstChar() => - _code.FindOptimizations.TryFindNextStartingPosition(runtext!, ref runtextpos, runtextbeg, runtextstart, runtextend); + protected internal override void Scan(ReadOnlySpan text) + { + // Configure the additional value to "bump" the position along each time we loop around + // to call FindFirstChar again, as well as the stopping position for the loop. We generally + // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump + // by -1 and stop at textbeg. + int bump = 1, stoppos = text.Length; + if (runregex!.RightToLeft) + { + bump = -1; + stoppos = 0; + } + + while (true) + { + if (FindFirstChar(text)) + { + CheckTimeout(); + + if (Go(text)) + { + return; + } - protected override void Go() + // Reset state for another iteration. 
+ runtrackpos = runtrack!.Length; + runstackpos = runstack!.Length; + runcrawlpos = runcrawl!.Length; + } + + if (runtextpos == stoppos) + { + return; + } + + runtextpos += bump; + } + } + + private bool FindFirstChar(ReadOnlySpan inputSpan) => + _code.FindOptimizations.TryFindNextStartingPosition(inputSpan, ref runtextpos, runtextbeg, runtextstart, runtextend); + + private bool Go(ReadOnlySpan inputSpan) { SetOperator((RegexOpcode)_code.Codes[0]); _codepos = 0; int advance = -1; - ReadOnlySpan inputSpan = runtext; while (true) { @@ -354,7 +392,7 @@ protected override void Go() switch (_operator) { case RegexOpcode.Stop: - return; + return runmatch!._matchcount[0] > 0; case RegexOpcode.Nothing: break; @@ -711,7 +749,7 @@ protected override void Go() continue; case RegexOpcode.Boundary: - if (!IsBoundary(runtextpos, runtextbeg, runtextend)) + if (!IsBoundary(inputSpan, runtextpos)) { break; } @@ -719,7 +757,7 @@ protected override void Go() continue; case RegexOpcode.NonBoundary: - if (IsBoundary(runtextpos, runtextbeg, runtextend)) + if (IsBoundary(inputSpan, runtextpos)) { break; } @@ -727,7 +765,7 @@ protected override void Go() continue; case RegexOpcode.ECMABoundary: - if (!IsECMABoundary(runtextpos, runtextbeg, runtextend)) + if (!IsECMABoundary(inputSpan, runtextpos)) { break; } @@ -735,7 +773,7 @@ protected override void Go() continue; case RegexOpcode.NonECMABoundary: - if (IsECMABoundary(runtextpos, runtextbeg, runtextend)) + if (IsECMABoundary(inputSpan, runtextpos)) { break; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 34b7f1b113059..a204e55e002c6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -24,7 
+24,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler private static readonly bool s_includePatternInName = Environment.GetEnvironmentVariable(IncludePatternInNamesEnvVar) == "1"; /// Parameter types for the generated Go and FindFirstChar methods. - private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner) }; + private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner), typeof(ReadOnlySpan) }; /// Id number to use for the next compiled regex. private static int s_regexCount; @@ -52,17 +52,20 @@ internal sealed class RegexLWCGCompiler : RegexCompiler description = string.Concat("_", pattern.Length > DescriptionLimit ? pattern.AsSpan(0, DescriptionLimit) : pattern); } - DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner)); + DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); EmitFindFirstChar(); - DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner)); + DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); EmitGo(); - return new CompiledRegexRunnerFactory(goMethod, findFirstCharMethod, code.TrackCount); + DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan) }); + EmitScan(findFirstCharMethod, goMethod); + + return new CompiledRegexRunnerFactory(scanMethod); } /// Begins the definition of a new method (no args) with a specified return value. - private DynamicMethod DefineDynamicMethod(string methname, Type? returntype, Type hostType) + private DynamicMethod DefineDynamicMethod(string methname, Type? 
returntype, Type hostType, Type[] paramTypes) { // We're claiming that these are static methods, but really they are instance methods. // By giving them a parameter which represents "this", we're tricking them into @@ -71,7 +74,7 @@ private DynamicMethod DefineDynamicMethod(string methname, Type? returntype, Typ const MethodAttributes Attribs = MethodAttributes.Public | MethodAttributes.Static; const CallingConventions Conventions = CallingConventions.Standard; - var dm = new DynamicMethod(methname, Attribs, Conventions, returntype, s_paramTypes, hostType, skipVisibility: false); + var dm = new DynamicMethod(methname, Attribs, Conventions, returntype, paramTypes, hostType, skipVisibility: false); _ilg = dm.GetILGenerator(); return dm; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 13a9fbf155bb1..ee4d03b6ab2d6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -88,25 +88,20 @@ protected RegexRunner() { } protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) => Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout); - protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) + protected internal virtual void Scan(ReadOnlySpan text) { - this.quick = quick; - - // Handle timeout argument - _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds - bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; - if (!ignoreTimeout) + string? s = runtext; + if (text != s) { - // We are using Environment.TickCount and not Stopwatch for performance reasons. 
- // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt - // overflow it will still stay ahead of Environment.TickCount for comparisons made - // in DoCheckTimeout(). - Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected - _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; - _timeoutOccursAt = Environment.TickCount + _timeout; - _timeoutChecksToSkip = TimeoutCheckFrequency; + throw new NotSupportedException(); // <-- If we l anded here then we are dealing with a CompiledToAssembly case where the new Span overloads are being used. } + Debug.Assert(runregex != null); + Scan(runregex, s, 0, s.Length, runtextstart, -1, quick, runregex.internalMatchTimeout); + } + + protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) + { // Configure the additional value to "bump" the position along each time we loop around // to call FindFirstChar again, as well as the stopping position for the loop. We generally // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump @@ -118,31 +113,6 @@ protected RegexRunner() { } stoppos = textbeg; } - // Store runtextpos into field, as we may bump it in next check. The remaining arguments - // are stored below once we're past the potential return in the next check. - runtextpos = textstart; - - // If previous match was empty or failed, advance by one before matching. - if (prevlen == 0) - { - if (textstart == stoppos) - { - return Match.Empty; - } - - runtextpos += bump; - } - - // Store remaining arguments into fields now that we're going to start the scan. - // These are referenced by the derived runner. - runregex = regex; - runtext = text; - runtextstart = textstart; - runtextbeg = textbeg; - runtextend = textend; - - // Main loop: FindFirstChar/Go + bump until the ending position. 
- bool initialized = false; while (true) { // Find the next potential location for a match in the input. @@ -151,18 +121,7 @@ protected RegexRunner() { } #endif if (FindFirstChar()) { - if (!ignoreTimeout) - { - DoCheckTimeout(); - } - - // Ensure that the runner is initialized. This includes initializing all of the state in the runner - // that Go might use, such as the backtracking stack, as well as a Match object for it to populate. - if (!initialized) - { - InitializeForGo(); - initialized = true; - } + CheckTimeout(); // See if there's a match at this position. #if DEBUG @@ -170,22 +129,9 @@ protected RegexRunner() { } #endif Go(); - // If we got a match, we're done. - Match match = runmatch!; - if (match._matchcount[0] > 0) + if (runmatch!._matchcount[0] > 0) { - runtext = null; // drop reference to text to avoid keeping it alive in a cache - - if (quick) - { - runmatch!.Text = null!; // drop reference - return null; - } - - // Return the match in its canonical form. - runmatch = null; - match.Tidy(runtextpos); - return match; + return runmatch; } // Reset state for another iteration. @@ -197,8 +143,6 @@ protected RegexRunner() { } // We failed to match at this position. If we're at the stopping point, we're done. if (runtextpos == stoppos) { - runtext = null; // drop reference to text to avoid keeping it alive in a cache - if (runmatch != null) runmatch.Text = null!; return Match.Empty; } @@ -207,6 +151,86 @@ protected RegexRunner() { } } } + internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textstart, bool quick) + { + this.quick = quick; + // Store remaining arguments into fields now that we're going to start the scan. + // These are referenced by the derived runner. + runregex = regex; + runtextstart = textstart; + runtextbeg = 0; + runtextend = text.Length; + runtextpos = textstart; + + if (runmatch is null) + { + // Use a hashtabled Match object if the capture numbers are sparse + runmatch = runregex!.caps is null ? 
+ new Match(runregex, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart) : + new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart); + } + else + { + runmatch.Reset(runregex!, runtext, runtextbeg, runtextend, runtextstart); + } + + // Note we test runcrawl, because it is the last one to be allocated. + // If there is an alloc failure in the middle of the three allocations, + // we may still return to reuse this instance, and we want to behave + // as if the allocations didn't occur. + if (runcrawl != null) + { + runtrackpos = runtrack!.Length; + runstackpos = runstack!.Length; + runcrawlpos = runcrawl.Length; + return; + } + + // Everything above runs once per match. + // Everything below runs once per runner. + + InitTrackCount(); + + int stacksize; + int tracksize = stacksize = runtrackcount * 8; + + if (tracksize < 32) + { + tracksize = 32; + } + if (stacksize < 16) + { + stacksize = 16; + } + + runtrack = new int[tracksize]; + runtrackpos = tracksize; + + runstack = new int[stacksize]; + runstackpos = stacksize; + + runcrawl = new int[32]; + runcrawlpos = 32; + } + + internal void InitializeTimeout(TimeSpan timeout) + { + // Handle timeout argument + _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds + bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; + if (!ignoreTimeout) + { + // We are using Environment.TickCount and not Stopwatch for performance reasons. + // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt + // overflow; it will still stay ahead of Environment.TickCount for comparisons made + // in DoCheckTimeout().
+ Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected + _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; + _timeoutOccursAt = Environment.TickCount + _timeout; + _timeoutChecksToSkip = TimeoutCheckFrequency; + } + } + /// Enumerates all of the matches with the specified regex, invoking the callback for each. /// /// This optionally repeatedly hands out the same Match instance, updated with new information. @@ -250,7 +274,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref runtextbeg = 0; // Main loop: FindFirstChar/Go + bump until the ending position. - bool initialized = false; while (true) { // Find the next potential location for a match in the input. @@ -264,14 +287,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref DoCheckTimeout(); } - // Ensure that the runner is initialized. This includes initializing all of the state in the runner - // that Go might use, such as the backtracking stack, as well as a Match object for it to populate. - if (!initialized) - { - InitializeForGo(); - initialized = true; - } - #if DEBUG Debug.WriteLineIf(Regex.EnableDebugTracing, $"Calling Go at {nameof(runtextpos)}={runtextpos}"); #endif @@ -291,7 +306,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref runmatch = null; } match.Tidy(runtextpos); - initialized = false; if (!callback(ref state, match)) { // If the callback returns false, we're done. @@ -359,7 +373,7 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref } } - protected void CheckTimeout() + protected internal void CheckTimeout() { if (_ignoreTimeout) return; @@ -385,7 +399,9 @@ private void DoCheckTimeout() if (0 > _timeoutOccursAt && 0 < currentMillis) return; - throw new RegexMatchTimeoutException(runtext!, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout)); + string input = runtext ?? 
string.Empty; + + throw new RegexMatchTimeoutException(input, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout)); } /// @@ -394,77 +410,21 @@ private void DoCheckTimeout() /// then to leave runtextpos at the ending position. It should leave /// runtextpos where it started if there was no match. /// - protected abstract void Go(); + protected virtual void Go() => throw new NotImplementedException(); /// /// The responsibility of FindFirstChar() is to advance runtextpos /// until it is at the next position which is a candidate for the /// beginning of a successful match. /// - protected abstract bool FindFirstChar(); + protected virtual bool FindFirstChar() => throw new NotImplementedException(); /// /// InitTrackCount must initialize the runtrackcount field; this is /// used to know how large the initial runtrack and runstack arrays /// must be. /// - protected abstract void InitTrackCount(); - - /// - /// Initializes all the data members that are used by Go() - /// - private void InitializeForGo() - { - if (runmatch is null) - { - // Use a hashtabled Match object if the capture numbers are sparse - runmatch = runregex!.caps is null ? - new Match(runregex, runregex.capsize, runtext!, runtextbeg, runtextend - runtextbeg, runtextstart) : - new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext!, runtextbeg, runtextend - runtextbeg, runtextstart); - } - else - { - runmatch.Reset(runregex!, runtext!, runtextbeg, runtextend, runtextstart); - } - - // Note we test runcrawl, because it is the last one to be allocated - // If there is an alloc failure in the middle of the three allocations, - // we may still return to reuse this instance, and we want to behave - // as if the allocations didn't occur. - if (runcrawl != null) - { - runtrackpos = runtrack!.Length; - runstackpos = runstack!.Length; - runcrawlpos = runcrawl.Length; - return; - } - - // Everything above runs once per match. - // Everything below runs once per runner. 
- - InitTrackCount(); - - int stacksize; - int tracksize = stacksize = runtrackcount * 8; - - if (tracksize < 32) - { - tracksize = 32; - } - if (stacksize < 16) - { - stacksize = 16; - } - - runtrack = new int[tracksize]; - runtrackpos = tracksize; - - runstack = new int[stacksize]; - runstackpos = stacksize; - - runcrawl = new int[32]; - runcrawlpos = 32; - } + protected virtual void InitTrackCount() { } /// /// Called by the implementation of Go() to increase the size of storage @@ -491,6 +451,12 @@ protected bool IsBoundary(int index, int startpos, int endpos) (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext![index])); } + protected bool IsBoundary(ReadOnlySpan inputSpan, int index) + { + return (index > runtextbeg && RegexCharClass.IsBoundaryWordChar(inputSpan[index - 1])) != + (index < inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[index])); + } + /// Called to determine a char's inclusion in the \w set. internal static bool IsWordChar(char ch) => RegexCharClass.IsWordChar(ch); @@ -500,6 +466,12 @@ protected bool IsECMABoundary(int index, int startpos, int endpos) (index < endpos && RegexCharClass.IsECMAWordChar(runtext![index])); } + protected bool IsECMABoundary(ReadOnlySpan inputSpan, int index) + { + return (index > runtextbeg && RegexCharClass.IsECMAWordChar(inputSpan[index - 1])) != + (index < inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[index])); + } + protected static bool CharInSet(char ch, string set, string category) { string charClass = RegexCharClass.ConvertOldStringsToClass(set, category); @@ -699,7 +671,10 @@ string DescribeTextPosition() if (runtextpos > runtextbeg) { - sb.Append(RegexCharClass.DescribeChar(runtext![runtextpos - 1])); + if (runtext != null) + { + sb.Append(RegexCharClass.DescribeChar(runtext[runtextpos - 1])); + } } else { @@ -710,7 +685,10 @@ string DescribeTextPosition() for (int i = runtextpos; i < runtextend; i++) { - sb.Append(RegexCharClass.DescribeChar(runtext![i])); + if 
(runtext != null) + { + sb.Append(RegexCharClass.DescribeChar(runtext[i])); + } } if (sb.Length >= 64) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 1abe7f9916077..2fdb7581bb48d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -91,23 +91,16 @@ internal Runner(SymbolicRegexMatcher matcher) _perThreadData = _matcher.CreatePerThreadData(); } - protected override void InitTrackCount() { } // nop, no backtracking - - protected override bool FindFirstChar() => true; // The logic is all in Go. - - protected override void Go() + protected internal override void Scan(ReadOnlySpan text) { - int beginning = runtextbeg; - ReadOnlySpan inputSpan = runtext.AsSpan(beginning, runtextend - beginning); - // Perform the match. - SymbolicMatch pos = _matcher.FindMatch(quick, inputSpan, runtextpos - beginning, _perThreadData); + SymbolicMatch pos = _matcher.FindMatch(quick, text, runtextpos, _perThreadData); // Transfer the result back to the RegexRunner state. if (pos.Success) { // If we successfully matched, capture the match, and then jump the current position to the end of the match. 
- int start = pos.Index + beginning; + int start = pos.Index; int end = start + pos.Length; if (!quick && pos.CaptureStarts != null) { @@ -118,7 +111,7 @@ protected override void Go() if (pos.CaptureStarts[cap] >= 0) { Debug.Assert(pos.CaptureEnds[cap] >= pos.CaptureStarts[cap]); - Capture(cap, pos.CaptureStarts[cap] + beginning, pos.CaptureEnds[cap] + beginning); + Capture(cap, pos.CaptureStarts[cap], pos.CaptureEnds[cap]); } } } From 6f933a3fcb144225a5fa03eef3ed6d8d45d68d76 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Thu, 17 Feb 2022 14:17:24 -0800 Subject: [PATCH 02/17] Addressing some PR Feedback --- .../gen/RegexGenerator.Emitter.cs | 3 +- .../src/Resources/Strings.resx | 64 ++++++++++++++++++- .../System/Text/RegularExpressions/Match.cs | 5 ++ .../Text/RegularExpressions/Regex.Match.cs | 2 +- .../System/Text/RegularExpressions/Regex.cs | 30 ++------- .../Text/RegularExpressions/RegexCompiler.cs | 40 ++++-------- .../RegularExpressions/RegexInterpreter.cs | 9 +-- .../Text/RegularExpressions/RegexRunner.cs | 26 ++++++-- .../tests/PrecompiledRegexScenarioTest.cs | 10 +++ 9 files changed, 120 insertions(+), 69 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index fe848a0b46f79..bbdd89dacb2f6 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -321,11 +321,10 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i { writer.WriteLine("return;"); } - writer.WriteLine(); } writer.WriteLine(); - writer.WriteLine("// We failed to find a match. If we're at the end of the input, then we are done."); + writer.WriteLine("// We failed to find a match. 
If we're at the end of the input, then we're done."); using (EmitBlock(writer, "if (base.runtextpos == text.Length)")) { writer.WriteLine("return;"); diff --git a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx index 154f34512ed1e..ebfd629548c0b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx +++ b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx @@ -1,5 +1,64 @@  + @@ -254,4 +313,7 @@ balancing group (?<name1-name2>subexpression) or (?'name1-name2' subexpression) - + + Attempted to call Span overloads with a pre compiled Regex engine is not supported. Please use the string overloads instead. + + \ No newline at end of file diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 19859fd2f0b2d..93ccb73fa3d9d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -84,6 +84,11 @@ internal void Reset(Regex regex, string? 
text, int textbeg, int textend, int tex _groupcoll?.Reset(); } + internal bool FoundAMatch + { + get => _matchcount[0] > 0; + } + public virtual GroupCollection Groups => _groupcoll ??= new GroupCollection(this, null); /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index d50e4dc396c5a..af023dd99c58d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -74,7 +74,7 @@ public bool IsMatch(string input) /// /// if the input matches the pattern, otherwise. public bool IsMatch(ReadOnlySpan input) => - Run(input, 0, input.Length, UseOptionR() ? input.Length : 0) is null; + Run(input, UseOptionR() ? input.Length : 0) is null; /// /// Searches the input string for one or more matches using the previous pattern and options, diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index d62d1cf16480d..1208b40e981f6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -394,7 +394,7 @@ protected void InitializeReferences() Match? match = runner.runmatch; // if we got a match, set runmatch to null if quick is true - if (match!._matchcount[0] > 0) + if (match!.FoundAMatch) { runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache @@ -445,40 +445,24 @@ protected void InitializeReferences() } } - internal Match? Run(ReadOnlySpan input, int beginning, int length, int startat) + internal Match? 
Run(ReadOnlySpan input, int startat) { if ((uint)startat > (uint)input.Length) { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.startat, ExceptionResource.BeginIndexNotNegative); } - if ((uint)length > (uint)input.Length) - { - ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.length, ExceptionResource.LengthNotNegative); - } RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { runner.InitializeTimeout(internalMatchTimeout); - ReadOnlySpan span = input.Slice(beginning, length); - runner.InitializeForScan(this, span, startat - beginning, quick: true); + runner.InitializeForScan(this, input, startat, quick: true); - runner.Scan(span); + runner.Scan(input); - Match? match = runner.runmatch; - // if we got a match, set runmatch to null if quick is true - if (match!._matchcount[0] > 0) - { - runner.runmatch!.Text = null; // Drop reference to text - return null; - } - - if (runner.runmatch != null) - { - runner.runmatch.Text = null; - } + runner.runmatch!.Text = null; // Drop reference to text - return RegularExpressions.Match.Empty; + return runner.runmatch!.FoundAMatch ? null : RegularExpressions.Match.Empty; } finally { @@ -506,7 +490,7 @@ internal void Run(string input, int startat, ref TState state, MatchCall Match? 
match = runner.runmatch; // if we got a match, set runmatch to null if quick is true - if (match!._matchcount[0] > 0) + if (match!.FoundAMatch) { if (match.Text != input) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index dc3a3c32e085e..ae0e619de4416 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3965,30 +3965,19 @@ void EmitStackPop() protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMethod) { - LocalBuilder bump = DeclareInt32(); - LocalBuilder stoppos = DeclareInt32(); Label returnLabel = DefineLabel(); - // int bump = 1 - Ldc(1); - Stloc(bump); - - // int stoppos = text.Length - _ilg!.Emit(OpCodes.Ldarga_S, 1); - Call(s_spanGetLengthMethod); - Stloc(stoppos); - // while (true) Label whileLoopEnd = DefineLabel(); Label whileLoopBody = DefineLabel(); MarkLabel(whileLoopBody); // if (FindFirstChar(text)) - Label afterFindFirstCharLabel = DefineLabel(); + Label postWhileLabel = DefineLabel(); Ldthis(); Ldarg_1(); Call(findFirstCharMethod); - BrfalseFar(afterFindFirstCharLabel); + BrfalseFar(postWhileLabel); if (_hasTimeout) { @@ -3999,30 +3988,24 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho // if (Go(text)) // return; - Label afterSuccessMatchLabel = DefineLabel(); Ldthis(); Ldarg_1(); Call(goMethod); - BrfalseFar(afterSuccessMatchLabel); - BrFar(returnLabel); - MarkLabel(afterSuccessMatchLabel); + BrtrueFar(returnLabel); - // if (runtextpos == stoppos) - Label incrementRuntextPosLabel = DefineLabel(); - MarkLabel(afterFindFirstCharLabel); + // if (runtextpos == text.length) + // return; + MarkLabel(postWhileLabel); Ldthisfld(s_runtextposField); - Ldloc(stoppos); + 
_ilg!.Emit(OpCodes.Ldarga_S, 1); + Call(s_spanGetLengthMethod); Ceq(); - BrfalseFar(incrementRuntextPosLabel); + BrtrueFar(returnLabel); - // return; - BrFar(returnLabel); - - // runtextpos += bump - MarkLabel(incrementRuntextPosLabel); + // runtextpos += 1 Ldthis(); Ldthisfld(s_runtextposField); - Ldloc(bump); + Ldc(1); Add(); Stfld(s_runtextposField); @@ -4032,7 +4015,6 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho // return; MarkLabel(returnLabel); - _ilg!.Emit(OpCodes.Nop); Ret(); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 8c96d6e624825..a75c74ed18dbb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -327,7 +327,7 @@ private bool MatchRef(int index, int length, ReadOnlySpan inputSpan) protected internal override void Scan(ReadOnlySpan text) { // Configure the additional value to "bump" the position along each time we loop around - // to call FindFirstChar again, as well as the stopping position for the loop. We generally + // to call TryFindNextStartingPosition again, as well as the stopping position for the loop. We generally // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump // by -1 and stop at textbeg. 
int bump = 1, stoppos = text.Length; @@ -339,7 +339,7 @@ protected internal override void Scan(ReadOnlySpan text) while (true) { - if (FindFirstChar(text)) + if (_code.FindOptimizations.TryFindNextStartingPosition(text, ref runtextpos, runtextbeg, runtextstart, runtextend)) { CheckTimeout(); @@ -363,9 +363,6 @@ protected internal override void Scan(ReadOnlySpan text) } } - private bool FindFirstChar(ReadOnlySpan inputSpan) => - _code.FindOptimizations.TryFindNextStartingPosition(inputSpan, ref runtextpos, runtextbeg, runtextstart, runtextend); - private bool Go(ReadOnlySpan inputSpan) { SetOperator((RegexOpcode)_code.Codes[0]); @@ -392,7 +389,7 @@ private bool Go(ReadOnlySpan inputSpan) switch (_operator) { case RegexOpcode.Stop: - return runmatch!._matchcount[0] > 0; + return runmatch!.FoundAMatch; case RegexOpcode.Nothing: break; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index ee4d03b6ab2d6..7f37110e1020f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -90,14 +90,26 @@ protected RegexRunner() { } protected internal virtual void Scan(ReadOnlySpan text) { - string? s = runtext; - if (text != s) + string s = runtext!; + // If beginning was passed in, then runtext and span won't match lengths so calculate the beginning to be used for the comparison. + int beginning = s.Length != text.Length ? s.Length - text.Length : 0; + if (text != s.AsSpan(beginning, text.Length)) { - throw new NotSupportedException(); // <-- If we l anded here then we are dealing with a CompiledToAssembly case where the new Span overloads are being used. + // If we landed here then we are dealing with a CompiledToAssembly case where the new Span overloads are being called. 
+ throw new NotSupportedException(SR.UsingSpanAPIsWithCompiledToAssembly); } - Debug.Assert(runregex != null); - Scan(runregex, s, 0, s.Length, runtextstart, -1, quick, runregex.internalMatchTimeout); + // If beginning wasn't zero, then we have to adjust some of the + // internal fields of RegexRunner to ensure the Precompiled Go and FFC + // will work as expected. + if (beginning != 0) + { + runtextbeg = beginning; + runtextstart += beginning; + runtextend += beginning; + } + + Scan(runregex!, s, beginning, beginning + text.Length, runtextstart + beginning, -1, quick, runregex!.internalMatchTimeout); } protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) @@ -129,7 +141,7 @@ protected internal virtual void Scan(ReadOnlySpan text) #endif Go(); - if (runmatch!._matchcount[0] > 0) + if (runmatch!.FoundAMatch) { return runmatch; } @@ -296,7 +308,7 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref // See if we have a match. Match match = runmatch!; - if (match._matchcount[0] > 0) + if (match.FoundAMatch) { // Hand it out to the callback in canonical form. 
if (!reuseMatchObject) diff --git a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs index f1666d9c1e9a4..e7939a11e63b4 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs @@ -20,8 +20,18 @@ public void TestPrecompiledRegex() string text = "asdf134success1245something"; RegexTestClass testClass = new RegexTestClass(); + // Test Matches overloads Assert.Equal(1, testClass.Matches(text).Count); + Assert.Equal(0, testClass.Matches(text, startat: 7).Count); + + // Test Match overloads Assert.Equal(1, testClass.Match(text).Groups[0].Captures.Count); + Assert.Equal(Match.Empty, testClass.Match(text, beginning: 7, length: text.Length - 7)); + Assert.Equal(5, testClass.Match(text, beginning: 5, length: text.Length - 5).Index); + Assert.Equal(Match.Empty, testClass.Match(text, startat: 7)); + Assert.Equal(6, testClass.Match(text, startat: 6).Index); + + // Test Groups Assert.Equal(text, testClass.Match(text).Groups[0].Value); Assert.Equal(new int[] { 0, 1, 2}, testClass.GetGroupNumbers()); Assert.Equal(new string[] { "0", "1", "output" }, testClass.GetGroupNames()); From 7bfb28c76333e9ee8a054dd841545f49322076e3 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Thu, 17 Feb 2022 14:44:41 -0800 Subject: [PATCH 03/17] Fixing case where span-based APIs are called from a precompiled regex and adding a test for it. 
--- .../src/System/Text/RegularExpressions/RegexRunner.cs | 6 +++--- .../tests/PrecompiledRegexScenarioTest.cs | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 7f37110e1020f..9d73595d4cba1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -90,10 +90,10 @@ protected RegexRunner() { } protected internal virtual void Scan(ReadOnlySpan text) { - string s = runtext!; + string? s = runtext; // If beginning was passed in, then runtext and span won't match lengths so calculate the beginning to be used for the comparison. - int beginning = s.Length != text.Length ? s.Length - text.Length : 0; - if (text != s.AsSpan(beginning, text.Length)) + int beginning = (s != null && s.Length != text.Length) ? s.Length - text.Length : 0; + if (s == null || text != s.AsSpan(beginning, text.Length)) { // If we landed here then we are dealing with a CompiledToAssembly case where the new Span overloads are being called. throw new NotSupportedException(SR.UsingSpanAPIsWithCompiledToAssembly); diff --git a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs index e7939a11e63b4..a52496948a12a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs @@ -31,6 +31,9 @@ public void TestPrecompiledRegex() Assert.Equal(Match.Empty, testClass.Match(text, startat: 7)); Assert.Equal(6, testClass.Match(text, startat: 6).Index); + // Test Span-based IsMatch throws for Precompiled scenario. 
+ Assert.Throws(() => testClass.IsMatch(text.AsSpan())); + // Test Groups Assert.Equal(text, testClass.Match(text).Groups[0].Value); Assert.Equal(new int[] { 0, 1, 2}, testClass.GetGroupNumbers()); From 95c9f3d4b7e5862e00bb3d0529cd08dcca72ab72 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Fri, 18 Feb 2022 10:45:34 -0800 Subject: [PATCH 04/17] Adding Tests for IsMatch span overload --- .../tests/Regex.Match.Tests.cs | 81 ++++++++++++++----- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 3bb9d23d02901..5463ea85c0de4 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -840,7 +840,7 @@ public void Match(RegexEngine engine, string pattern, string input, RegexOptions if (isDefaultStart && isDefaultCount) { VerifyMatch(r.Match(input)); - Assert.Equal(expectedSuccess, r.IsMatch(input)); + VerifyIsMatch(r, input, expectedSuccess, Regex.InfiniteMatchTimeout); } if (beginning + length == input.Length && (options & RegexOptions.RightToLeft) == 0) { @@ -857,7 +857,7 @@ public void Match(RegexEngine engine, string pattern, string input, RegexOptions case RegexEngine.Compiled: case RegexEngine.NonBacktracking: VerifyMatch(Regex.Match(input, pattern, options | RegexHelpers.OptionsFromEngine(engine))); - Assert.Equal(expectedSuccess, Regex.IsMatch(input, pattern, options | RegexHelpers.OptionsFromEngine(engine))); + VerifyIsMatch(null, input, expectedSuccess, Regex.InfiniteMatchTimeout, pattern, options | RegexHelpers.OptionsFromEngine(engine)); break; } } @@ -1011,20 +1011,20 @@ public void Match_DefaultTimeout_Throws(RegexOptions options) if ((RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture) == RegexOptions.None) { Assert.Throws(() => new Regex(Pattern).Match(input)); - 
Assert.Throws(() => new Regex(Pattern).IsMatch(input)); + VerifyIsMatchThrows(new Regex(Pattern), input, Regex.InfiniteMatchTimeout); Assert.Throws(() => new Regex(Pattern).Matches(input).Count); Assert.Throws(() => Regex.Match(input, Pattern)); - Assert.Throws(() => Regex.IsMatch(input, Pattern)); + VerifyIsMatchThrows(null, input, Regex.InfiniteMatchTimeout, Pattern); Assert.Throws(() => Regex.Matches(input, Pattern).Count); } Assert.Throws(() => new Regex(Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)).Match(input)); - Assert.Throws(() => new Regex(Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)).IsMatch(input)); + VerifyIsMatchThrows(new Regex(Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)), input, Regex.InfiniteMatchTimeout); Assert.Throws(() => new Regex(Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)).Matches(input).Count); Assert.Throws(() => Regex.Match(input, Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture))); - Assert.Throws(() => Regex.IsMatch(input, Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture))); + VerifyIsMatchThrows(null, input, Regex.InfiniteMatchTimeout, Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)); Assert.Throws(() => Regex.Matches(input, Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)).Count); }, ((int)options).ToString(CultureInfo.InvariantCulture)).Dispose(); } @@ -1036,9 +1036,9 @@ public void Match_DefaultTimeout_Throws(RegexOptions options) public void Match_CachedPattern_NewTimeoutApplies(RegexOptions options) { const string PatternLeadingToLotsOfBacktracking = @"^(\w+\s?)*$"; - Assert.True(Regex.IsMatch("", PatternLeadingToLotsOfBacktracking, options, TimeSpan.FromDays(1))); + VerifyIsMatch(null, "", true, TimeSpan.FromDays(1), PatternLeadingToLotsOfBacktracking, options); var sw = 
Stopwatch.StartNew(); - Assert.Throws(() => Regex.IsMatch("An input string that takes a very very very very very very very very very very very long time!", PatternLeadingToLotsOfBacktracking, options, TimeSpan.FromMilliseconds(1))); + VerifyIsMatchThrows(null, "An input string that takes a very very very very very very very very very very very long time!", TimeSpan.FromMilliseconds(1), PatternLeadingToLotsOfBacktracking, options); Assert.InRange(sw.Elapsed.TotalSeconds, 0, 10); // arbitrary upper bound that should be well above what's needed with a 1ms timeout } @@ -1408,7 +1408,7 @@ public async Task Match_Advanced(RegexEngine engine, string pattern, string inpu VerifyMatch(r.Match(input)); VerifyMatch(Regex.Match(input, pattern, options)); - Assert.True(Regex.IsMatch(input, pattern, options)); + VerifyIsMatch(null, input, true, Regex.InfiniteMatchTimeout, pattern, options); } if (beginning + length == input.Length) @@ -1561,9 +1561,9 @@ public void Match_ExcessPrefix(RegexEngine engine) // Should not throw out of memory // Repeaters - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{2147483647,}")).IsMatch("a")); - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")).IsMatch("a")); - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50_000,}")).IsMatch("a")); // cutoff for Boyer-Moore prefix in release + VerifyIsMatch((await RegexHelpers.GetRegexAsync(engine, @"a{2147483647,}")), "a", false, Regex.InfiniteMatchTimeout); + VerifyIsMatch((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")), "a", false, Regex.InfiniteMatchTimeout); + VerifyIsMatch((await RegexHelpers.GetRegexAsync(engine, @"a{50_000,}")), "a", false, Regex.InfiniteMatchTimeout); // cutoff for Boyer-Moore prefix in release // Multis foreach (int length in new[] { 50, 50_000, char.MaxValue + 1 }) @@ -1575,7 +1575,7 @@ public void Match_ExcessPrefix(RegexEngine engine) if (!RegexHelpers.IsNonBacktracking(engine) || length < 50_000) { string s = "bcd" + new string('a', 
length) + "efg"; - Assert.True((await RegexHelpers.GetRegexAsync(engine, @$"a{{{length}}}")).IsMatch(s)); + VerifyIsMatch((await RegexHelpers.GetRegexAsync(engine, @$"a{{{length}}}")), s, true, Regex.InfiniteMatchTimeout); } } }, engine.ToString()).Dispose(); @@ -1625,9 +1625,9 @@ public void IsMatch_Invalid() AssertExtensions.Throws("input", () => r.IsMatch(null, 0)); // Pattern is null - AssertExtensions.Throws("pattern", () => Regex.IsMatch("input", null)); - AssertExtensions.Throws("pattern", () => Regex.IsMatch("input", null, RegexOptions.None)); - AssertExtensions.Throws("pattern", () => Regex.IsMatch("input", null, RegexOptions.None, TimeSpan.FromSeconds(1))); + VerifyIsMatchThrows(null, "input", Regex.InfiniteMatchTimeout, pattern: null); + VerifyIsMatchThrows(null, "input", Regex.InfiniteMatchTimeout, pattern: null, RegexOptions.None); + VerifyIsMatchThrows(null, "input", TimeSpan.FromSeconds(1), pattern: null, RegexOptions.None); // Start is invalid Assert.Throws(() => r.IsMatch("input", -1)); @@ -1650,7 +1650,7 @@ public static IEnumerable IsMatch_SucceedQuicklyDueToLoopReduction_Mem public async Task IsMatch_SucceedQuicklyDueToLoopReduction(RegexEngine engine, string pattern, string input, bool expected) { Regex r = await RegexHelpers.GetRegexAsync(engine, pattern); - Assert.Equal(expected, r.IsMatch(input)); + VerifyIsMatch(r, input, expected, Regex.InfiniteMatchTimeout); } [Theory] @@ -1660,6 +1660,10 @@ public async Task TestCharIsLowerCultureEdgeCasesAroundTurkishCharacters(RegexEn Regex r1 = await RegexHelpers.GetRegexAsync(engine, "[\u012F-\u0130]", RegexOptions.IgnoreCase); Regex r2 = await RegexHelpers.GetRegexAsync(engine, "[\u012F\u0130]", RegexOptions.IgnoreCase); Assert.Equal(r1.IsMatch("\u0130"), r2.IsMatch("\u0130")); +#if NET7_0_OR_GREATER + Assert.Equal(r1.IsMatch("\u0130".AsSpan()), r2.IsMatch("\u0130".AsSpan())); +#endif + } [Fact] @@ -1688,8 +1692,8 @@ public void Synchronized() public async Task Match_Boundary(RegexEngine engine) { 
Regex r = await RegexHelpers.GetRegexAsync(engine, @"\b\w+\b"); - Assert.False(r.IsMatch(" AB\u200cCD ")); - Assert.False(r.IsMatch(" AB\u200dCD ")); + VerifyIsMatch(r, " AB\u200cCD ", false, Regex.InfiniteMatchTimeout); + VerifyIsMatch(r, " AB\u200dCD ", false, Regex.InfiniteMatchTimeout); } public static IEnumerable Match_Count_TestData() @@ -2002,11 +2006,48 @@ public async Task StandardCharSets_SameMeaningAcrossAllEngines(string singleChar bool baseline = regexes[0].IsMatch(s); for (int i = 1; i < regexes.Count; i++) { - Assert.Equal(baseline, regexes[i].IsMatch(s)); + VerifyIsMatch(regexes[i], s, baseline, Regex.InfiniteMatchTimeout); } } } + private static void VerifyIsMatchThrows(Regex? r, string input, TimeSpan timeout, string? pattern = null, RegexOptions options = RegexOptions.None) + where T : Exception + { + if (r == null) + { + Assert.Throws(() => timeout == Regex.InfiniteMatchTimeout ? Regex.IsMatch(input, pattern, options) : Regex.IsMatch(input, pattern, options, timeout)); +#if NET7_0_OR_GREATER + Assert.Throws(() => timeout == Regex.InfiniteMatchTimeout ? Regex.IsMatch(input.AsSpan(), pattern, options) : Regex.IsMatch(input.AsSpan(), pattern, options, timeout)); +#endif + } + else + { + Assert.Throws(() => r.IsMatch(input)); +#if NET7_0_OR_GREATER + Assert.Throws(() => r.IsMatch(input.AsSpan())); +#endif + } + } + + private static void VerifyIsMatch(Regex? r, string input, bool expected, TimeSpan timeout, string? pattern = null, RegexOptions options = RegexOptions.None) + { + if (r == null) + { + Assert.Equal(expected, timeout == Regex.InfiniteMatchTimeout ? Regex.IsMatch(input, pattern, options) : Regex.IsMatch(input, pattern, options, timeout)); +#if NET7_0_OR_GREATER + Assert.Equal(expected, timeout == Regex.InfiniteMatchTimeout ? 
Regex.IsMatch(input.AsSpan(), pattern, options) : Regex.IsMatch(input.AsSpan(), pattern, options, timeout)); +#endif + } + else + { + Assert.Equal(expected, r.IsMatch(input)); +#if NET7_0_OR_GREATER + Assert.Equal(expected, r.IsMatch(input.AsSpan())); +#endif + } + } + public static IEnumerable Match_DisjunctionOverCounting_TestData() { foreach (RegexEngine engine in RegexHelpers.AvailableEngines) From 6a2043eaf5db49a675470e415d66d4e2637cbd19 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Fri, 18 Feb 2022 13:22:04 -0800 Subject: [PATCH 05/17] Refactor Run and Run<TState> to share centralized logic --- .../System/Text/RegularExpressions/Regex.cs | 114 ++++++++---------- 1 file changed, 48 insertions(+), 66 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 1208b40e981f6..9f97f2636aa81 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -390,59 +390,66 @@ protected void InitializeReferences() runner.runtextpos += RightToLeft ? -1 : 1; } - runner.Scan(span); - - Match? match = runner.runmatch; - // if we got a match, set runmatch to null if quick is true - if (match!.FoundAMatch) - { - runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache - - if (match.Text != input) - { - match.Text = input; - } - - if (quick) - { - runner.runmatch!.Text = null; // Drop reference to text - return null; - } + return InternalPerformScan(quick, input, beginning, runner, span, returnNullIfQuick: true); + } + finally + { + _runner = runner; + } + } - runner.runmatch = null; - match.Tidy(runner.runtextpos); + private static Match? 
InternalPerformScan(bool quick, string input, int beginning, RegexRunner runner, ReadOnlySpan span, bool returnNullIfQuick) + { + runner.Scan(span); - // If there was a match and the original text was sliced, then add beginning to the index to get the real - // Index of the match. - if (match.Success && beginning != 0) - { - match.AddBeginningToIndex(beginning); - } + Match? match = runner.runmatch; + // if we got a match, set runmatch to null if quick is true + if (match!.FoundAMatch) + { + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache - return match; + if (match.Text != input) + { + match.Text = input; } - runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache + if (quick && returnNullIfQuick) + { + runner.runmatch!.Text = null; // Drop reference to text + return null; + } if (!quick) - { runner.runmatch = null; - } - else - { - if (runner.runmatch != null) - { - runner.runmatch.Text = null; - } + match.Tidy(runner.runtextpos); + + // If there was a match and the original text was sliced, then add beginning to the index to get the real + // Index of the match. + if (match.Success && beginning != 0) + { + match.AddBeginningToIndex(beginning); } - return RegularExpressions.Match.Empty; + return match; } - finally + + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache + + if (!quick) { - _runner = runner; + runner.runmatch = null; + } + else + { + + if (runner.runmatch != null) + { + runner.runmatch.Text = null; + } } + + return RegularExpressions.Match.Empty; } internal Match? Run(ReadOnlySpan input, int startat) @@ -485,26 +492,12 @@ internal void Run(string input, int startat, ref TState state, MatchCall runner.runtextpos = runtextpos; int stoppos = RightToLeft ? 0 : input.Length; - runner.Scan(input); - Match? match = runner.runmatch; + Match? 
match = InternalPerformScan(reuseMatchObject, input, 0, runner, input, false); // if we got a match, set runmatch to null if quick is true - if (match!.FoundAMatch) + if (match!.Success) { - if (match.Text != input) - { - match.Text = input; - } - - if (!reuseMatchObject) - { - // We're not reusing match objects, so null out our field reference to the instance. - // It'll be recreated the next time one is needed. - runner.runmatch = null; - } - - match.Tidy(runner.runtextpos); if (!callback(ref state, match)) { if (reuseMatchObject) @@ -551,17 +544,6 @@ internal void Run(string input, int startat, ref TState state, MatchCall // We failed to match at this position. If we're at the stopping point, we're done. if (runner.runtextpos == stoppos) { - if (!reuseMatchObject) - { - runner.runmatch = null; - } - else - { - if (runner.runmatch != null) - { - runner.runmatch.Text = null!; - } - } return; } } From c410b50d8bf25762409cbce5d55ca219c9516ec2 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Fri, 18 Feb 2022 14:08:20 -0800 Subject: [PATCH 06/17] Emit IsBoundary and IsECMABoundary instead of exposing them. 
--- .../gen/RegexGenerator.Emitter.cs | 89 +++++++++++++++++-- .../ref/System.Text.RegularExpressions.cs | 2 - .../Text/RegularExpressions/RegexRunner.cs | 4 +- 3 files changed, 84 insertions(+), 11 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index bbdd89dacb2f6..2d6d4ab81efca 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -276,6 +276,68 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" }}"); } + if ((requiredHelpers & RequiredHelperFunctions.IsBoundary) != 0) + { + writer.WriteLine(); + writer.WriteLine($" /// Determines whether the character at the specified index is a boundary."); + writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); + writer.WriteLine($" private static bool IsBoundary(global::System.ReadOnlySpaninputSpan, int index, int startpos, int endpos)"); + writer.WriteLine($" {{"); + writer.WriteLine($" return (index > startpos && IsBoundaryWordChar(inputSpan![index - 1])) !="); + writer.WriteLine($" (index < endpos && IsBoundaryWordChar(inputSpan![index]));"); + writer.WriteLine(); + writer.WriteLine($" bool IsBoundaryWordChar(char ch)"); + writer.WriteLine($" {{"); + writer.WriteLine($" global::System.ReadOnlySpan asciiLookup = new byte[]"); + writer.WriteLine($" {{"); + writer.WriteLine($" 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,"); + writer.WriteLine($" 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07"); + writer.WriteLine($" }};"); + writer.WriteLine(); + writer.WriteLine($" int chDiv8 = ch >> 3;"); + writer.WriteLine($" if ((uint)chDiv8 < (uint)asciiLookup.Length)"); + writer.WriteLine($" {{"); + writer.WriteLine($" return (asciiLookup[chDiv8] & (1 
<< (ch & 0x7))) != 0;"); + writer.WriteLine($" }}"); + writer.WriteLine(); + writer.WriteLine($" switch (global::System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch))"); + writer.WriteLine($" {{"); + writer.WriteLine($" case global::System.Globalization.UnicodeCategory.UppercaseLetter:"); + writer.WriteLine($" case global::System.Globalization.UnicodeCategory.LowercaseLetter:"); + writer.WriteLine($" case global::System.Globalization.UnicodeCategory.TitlecaseLetter:"); + writer.WriteLine($" case global::System.Globalization.UnicodeCategory.ModifierLetter:"); + writer.WriteLine($" case global::System.Globalization.UnicodeCategory.OtherLetter:"); + writer.WriteLine($" case global::System.Globalization.UnicodeCategory.NonSpacingMark:"); + writer.WriteLine($" case global::System.Globalization.UnicodeCategory.DecimalDigitNumber:"); + writer.WriteLine($" case global::System.Globalization.UnicodeCategory.ConnectorPunctuation:"); + writer.WriteLine($" return true;"); + writer.WriteLine(); + writer.WriteLine($" default:"); + writer.WriteLine($" const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';"); + writer.WriteLine($" return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner;"); + writer.WriteLine($" }}"); + writer.WriteLine($" }}"); + writer.WriteLine($" }}"); + } + + if ((requiredHelpers & RequiredHelperFunctions.IsECMABoundary) != 0) + { + writer.WriteLine(); + writer.WriteLine($" /// Determines whether the character at the specified index is a boundary."); + writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); + writer.WriteLine($" private static bool IsECMABoundary(global::System.ReadOnlySpaninputSpan, int index, int startpos, int endpos)"); + writer.WriteLine($" {{"); + writer.WriteLine($" return (index > startpos && IsECMAWordChar(inputSpan![index - 1])) !="); + writer.WriteLine($" (index < endpos && IsECMAWordChar(inputSpan![index]));"); + 
writer.WriteLine(); + writer.WriteLine($" bool IsECMAWordChar(char ch) =>"); + writer.WriteLine($" ((((uint)ch - 'A') & ~0x20) < 26) || // ASCII letter"); + writer.WriteLine($" (((uint)ch - '0') < 10) || // digit"); + writer.WriteLine($" ch == '_' || // underscore"); + writer.WriteLine($" ch == '\u0130'; // latin capital letter I with dot above"); + writer.WriteLine($" }}"); + } + writer.WriteLine($" }}"); writer.WriteLine($" }}"); writer.WriteLine("}"); @@ -2176,13 +2238,22 @@ void EmitBoundary(RegexNode node) string call = node.Kind switch { - RegexNodeKind.Boundary => "!base.IsBoundary", - RegexNodeKind.NonBoundary => "base.IsBoundary", - RegexNodeKind.ECMABoundary => "!base.IsECMABoundary", - _ => "base.IsECMABoundary", + RegexNodeKind.Boundary => "!IsBoundary", + RegexNodeKind.NonBoundary => "IsBoundary", + RegexNodeKind.ECMABoundary => "!IsECMABoundary", + _ => "IsECMABoundary", }; - using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}))")) + var boundaryFunctionRequired = node.Kind switch + { + RegexNodeKind.Boundary or + RegexNodeKind.NonBoundary => RequiredHelperFunctions.IsBoundary, + _ => RequiredHelperFunctions.IsECMABoundary + }; + + requiredHelpers |= boundaryFunctionRequired; + + using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))")) { writer.WriteLine($"goto {doneLabel};"); } @@ -3865,9 +3936,13 @@ public void Dispose() private enum RequiredHelperFunctions { /// No additional functions are required. - None, + None = 0b0, /// The IsWordChar helper is required. - IsWordChar + IsWordChar = 0b1, + /// The IsBoundary helper is required. + IsBoundary = 0b10, + /// The IsECMABoundary helper is required. 
+ IsECMABoundary = 0b100 } } } diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 3abad4033aad4..dd6956d11dc7f 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -338,9 +338,7 @@ protected void EnsureStorage() { } protected virtual void Go() { throw null; } protected virtual void InitTrackCount() { throw null; } protected bool IsBoundary(int index, int startpos, int endpos) { throw null; } - protected bool IsBoundary(System.ReadOnlySpan inputSpan, int index) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; } - protected bool IsECMABoundary(System.ReadOnlySpan inputSpan, int index) { throw null; } // -> This is just temporary on the prototype. 
Method will be emitted by the generator engines protected bool IsMatched(int cap) { throw null; } protected int MatchIndex(int cap) { throw null; } protected int MatchLength(int cap) { throw null; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 9d73595d4cba1..8a1658c812516 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -463,7 +463,7 @@ protected bool IsBoundary(int index, int startpos, int endpos) (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext![index])); } - protected bool IsBoundary(ReadOnlySpan inputSpan, int index) + internal bool IsBoundary(ReadOnlySpan inputSpan, int index) { return (index > runtextbeg && RegexCharClass.IsBoundaryWordChar(inputSpan[index - 1])) != (index < inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[index])); @@ -478,7 +478,7 @@ protected bool IsECMABoundary(int index, int startpos, int endpos) (index < endpos && RegexCharClass.IsECMAWordChar(runtext![index])); } - protected bool IsECMABoundary(ReadOnlySpan inputSpan, int index) + internal bool IsECMABoundary(ReadOnlySpan inputSpan, int index) { return (index > runtextbeg && RegexCharClass.IsECMAWordChar(inputSpan[index - 1])) != (index < inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[index])); From 6d1710bb4858ce6af1f7053ce6d27298a9477b3e Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Fri, 18 Feb 2022 15:28:11 -0800 Subject: [PATCH 07/17] Address Emitter changes feedback. 
--- .../gen/RegexGenerator.Emitter.cs | 52 +++++-------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 2d6d4ab81efca..babede71fe0b9 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -281,42 +281,16 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine(); writer.WriteLine($" /// Determines whether the character at the specified index is a boundary."); writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); - writer.WriteLine($" private static bool IsBoundary(global::System.ReadOnlySpaninputSpan, int index, int startpos, int endpos)"); + writer.WriteLine($" private bool IsBoundary(global::System.ReadOnlySpaninputSpan, int index)"); writer.WriteLine($" {{"); - writer.WriteLine($" return (index > startpos && IsBoundaryWordChar(inputSpan![index - 1])) !="); - writer.WriteLine($" (index < endpos && IsBoundaryWordChar(inputSpan![index]));"); + writer.WriteLine($" const char ZeroWidthNonJoiner = '\\u200C', ZeroWidthJoiner = '\\u200D';"); writer.WriteLine(); - writer.WriteLine($" bool IsBoundaryWordChar(char ch)"); - writer.WriteLine($" {{"); - writer.WriteLine($" global::System.ReadOnlySpan asciiLookup = new byte[]"); - writer.WriteLine($" {{"); - writer.WriteLine($" 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,"); - writer.WriteLine($" 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07"); - writer.WriteLine($" }};"); - writer.WriteLine(); - writer.WriteLine($" int chDiv8 = ch >> 3;"); - writer.WriteLine($" if ((uint)chDiv8 < (uint)asciiLookup.Length)"); - writer.WriteLine($" {{"); - writer.WriteLine($" return (asciiLookup[chDiv8] & (1 << (ch & 
0x7))) != 0;"); - writer.WriteLine($" }}"); - writer.WriteLine(); - writer.WriteLine($" switch (global::System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch))"); - writer.WriteLine($" {{"); - writer.WriteLine($" case global::System.Globalization.UnicodeCategory.UppercaseLetter:"); - writer.WriteLine($" case global::System.Globalization.UnicodeCategory.LowercaseLetter:"); - writer.WriteLine($" case global::System.Globalization.UnicodeCategory.TitlecaseLetter:"); - writer.WriteLine($" case global::System.Globalization.UnicodeCategory.ModifierLetter:"); - writer.WriteLine($" case global::System.Globalization.UnicodeCategory.OtherLetter:"); - writer.WriteLine($" case global::System.Globalization.UnicodeCategory.NonSpacingMark:"); - writer.WriteLine($" case global::System.Globalization.UnicodeCategory.DecimalDigitNumber:"); - writer.WriteLine($" case global::System.Globalization.UnicodeCategory.ConnectorPunctuation:"); - writer.WriteLine($" return true;"); + writer.WriteLine($" return (index > base.runtextbeg && IsBoundaryWordChar(inputSpan![index - 1])) !="); + writer.WriteLine($" (index < inputSpan.Length && IsBoundaryWordChar(inputSpan![index]));"); writer.WriteLine(); - writer.WriteLine($" default:"); - writer.WriteLine($" const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';"); - writer.WriteLine($" return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner;"); - writer.WriteLine($" }}"); - writer.WriteLine($" }}"); + writer.WriteLine($" bool IsBoundaryWordChar(char ch) =>"); + writer.WriteLine($" IsWordChar(ch) ||"); + writer.WriteLine($" (ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner);"); writer.WriteLine($" }}"); } @@ -325,16 +299,16 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine(); writer.WriteLine($" /// Determines whether the character at the specified index is a boundary."); writer.WriteLine($" 
[global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); - writer.WriteLine($" private static bool IsECMABoundary(global::System.ReadOnlySpaninputSpan, int index, int startpos, int endpos)"); + writer.WriteLine($" private bool IsECMABoundary(global::System.ReadOnlySpaninputSpan, int index)"); writer.WriteLine($" {{"); - writer.WriteLine($" return (index > startpos && IsECMAWordChar(inputSpan![index - 1])) !="); - writer.WriteLine($" (index < endpos && IsECMAWordChar(inputSpan![index]));"); + writer.WriteLine($" return (index > base.runtextbeg && IsECMAWordChar(inputSpan![index - 1])) !="); + writer.WriteLine($" (index < inputSpan.Length && IsECMAWordChar(inputSpan![index]));"); writer.WriteLine(); writer.WriteLine($" bool IsECMAWordChar(char ch) =>"); writer.WriteLine($" ((((uint)ch - 'A') & ~0x20) < 26) || // ASCII letter"); writer.WriteLine($" (((uint)ch - '0') < 10) || // digit"); writer.WriteLine($" ch == '_' || // underscore"); - writer.WriteLine($" ch == '\u0130'; // latin capital letter I with dot above"); + writer.WriteLine($" ch == '\\u0130'; // latin capital letter I with dot above"); writer.WriteLine($" }}"); } @@ -2247,13 +2221,13 @@ void EmitBoundary(RegexNode node) var boundaryFunctionRequired = node.Kind switch { RegexNodeKind.Boundary or - RegexNodeKind.NonBoundary => RequiredHelperFunctions.IsBoundary, + RegexNodeKind.NonBoundary => RequiredHelperFunctions.IsBoundary | RequiredHelperFunctions.IsWordChar, // IsBoundary internally uses IsWordChar _ => RequiredHelperFunctions.IsECMABoundary }; requiredHelpers |= boundaryFunctionRequired; - using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))")) + using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? 
$" + {sliceStaticPos}" : "")}))")) { writer.WriteLine($"goto {doneLabel};"); } From 4ca5883d2a72424afa22ce4d7ae21adf5cf4e3b1 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Tue, 22 Feb 2022 15:57:06 -0800 Subject: [PATCH 08/17] Addressing PR Feedback. --- .../gen/RegexGenerator.Emitter.cs | 46 +++++++++---------- .../src/Resources/Strings.resx | 2 +- .../System/Text/RegularExpressions/Match.cs | 2 +- .../System/Text/RegularExpressions/Regex.cs | 4 +- .../RegularExpressions/RegexInterpreter.cs | 2 +- .../Text/RegularExpressions/RegexRunner.cs | 4 +- .../tests/PrecompiledRegexScenarioTest.cs | 38 +++++++++------ 7 files changed, 55 insertions(+), 43 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index babede71fe0b9..c622bba167783 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -227,10 +227,10 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" }}"); writer.WriteLine(); - writer.WriteLine($" private bool FindFirstChar(global::System.ReadOnlySpan inputSpan)"); + writer.WriteLine($" private bool FindNextPossibleStartingPosition(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; - RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id); + RequiredHelperFunctions requiredHelpers = EmitFindNextPossibleStartingPosition(writer, rm, id); writer.Indent -= 4; writer.WriteLine($" }}"); writer.WriteLine(); @@ -238,10 +238,10 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri { writer.WriteLine($" [global::System.Runtime.CompilerServices.SkipLocalsInit]"); } - writer.WriteLine($" private bool Go(global::System.ReadOnlySpan inputSpan)"); + writer.WriteLine($" private bool 
ValidateCurrentPositionMatches(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; - requiredHelpers |= EmitGo(writer, rm, id); + requiredHelpers |= EmitValidateCurrentPositionMatches(writer, rm, id); writer.Indent -= 4; writer.WriteLine($" }}"); @@ -281,14 +281,14 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine(); writer.WriteLine($" /// Determines whether the character at the specified index is a boundary."); writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); - writer.WriteLine($" private bool IsBoundary(global::System.ReadOnlySpaninputSpan, int index)"); + writer.WriteLine($" private static bool IsBoundary(global::System.ReadOnlySpan inputSpan, int index)"); writer.WriteLine($" {{"); writer.WriteLine($" const char ZeroWidthNonJoiner = '\\u200C', ZeroWidthJoiner = '\\u200D';"); writer.WriteLine(); - writer.WriteLine($" return (index > base.runtextbeg && IsBoundaryWordChar(inputSpan![index - 1])) !="); - writer.WriteLine($" (index < inputSpan.Length && IsBoundaryWordChar(inputSpan![index]));"); + writer.WriteLine($" return (index > 0 && IsBoundaryWordChar(inputSpan[index - 1])) !="); + writer.WriteLine($" ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));"); writer.WriteLine(); - writer.WriteLine($" bool IsBoundaryWordChar(char ch) =>"); + writer.WriteLine($" static bool IsBoundaryWordChar(char ch) =>"); writer.WriteLine($" IsWordChar(ch) ||"); writer.WriteLine($" (ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner);"); writer.WriteLine($" }}"); @@ -299,12 +299,12 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine(); writer.WriteLine($" /// Determines whether the character at the specified index is a boundary."); writer.WriteLine($" 
[global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); - writer.WriteLine($" private bool IsECMABoundary(global::System.ReadOnlySpaninputSpan, int index)"); + writer.WriteLine($" private static bool IsECMABoundary(global::System.ReadOnlySpan inputSpan, int index)"); writer.WriteLine($" {{"); - writer.WriteLine($" return (index > base.runtextbeg && IsECMAWordChar(inputSpan![index - 1])) !="); - writer.WriteLine($" (index < inputSpan.Length && IsECMAWordChar(inputSpan![index]));"); + writer.WriteLine($" return (index > 0 && IsECMAWordChar(inputSpan[index - 1])) !="); + writer.WriteLine($" ((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));"); writer.WriteLine(); - writer.WriteLine($" bool IsECMAWordChar(char ch) =>"); + writer.WriteLine($" static bool IsECMAWordChar(char ch) =>"); writer.WriteLine($" ((((uint)ch - 'A') & ~0x20) < 26) || // ASCII letter"); writer.WriteLine($" (((uint)ch - '0') < 10) || // digit"); writer.WriteLine($" ch == '_' || // underscore"); @@ -344,7 +344,7 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i { using (EmitBlock(writer, "while (true)")) { - using (EmitBlock(writer, "if (FindFirstChar(text))")) + using (EmitBlock(writer, "if (FindNextPossibleStartingPosition(text))")) { if (rm.MatchTimeout != Timeout.Infinite) { @@ -353,7 +353,7 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i } writer.WriteLine("// If we got a match, we're done."); - using (EmitBlock(writer, "if (Go(text))")) + using (EmitBlock(writer, "if (ValidateCurrentPositionMatches(text))")) { writer.WriteLine("return;"); } @@ -371,8 +371,8 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i } } - /// Emits the body of the FindFirstChar override. 
- private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id) + /// Emits the body of the FindNextPossibleStartingPosition. + private static RequiredHelperFunctions EmitFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, string id) { RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; @@ -461,7 +461,7 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ // searching is required; otherwise, false. bool EmitAnchors() { - // Anchors that fully implement FindFirstChar, with a check that leads to immediate success or failure determination. + // Anchors that fully implement FindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. switch (code.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: @@ -778,8 +778,8 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or } } - /// Emits the body of the Go override. - private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMethod rm, string id) + /// Emits the body of the ValidateCurrentPositionMatches. + private static RequiredHelperFunctions EmitValidateCurrentPositionMatches(IndentedTextWriter writer, RegexMethod rm, string id) { // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled // version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via @@ -798,7 +798,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // label that code should jump back to when backtracking. That way, a subsequent EmitXx function doesn't need to know exactly // where to jump: it simply always jumps to "doneLabel" on match failure, and "doneLabel" is always configured to point to // the right location. 
In an expression without backtracking, or before any backtracking constructs have been encountered, - // "doneLabel" is simply the final return location from the Go method that will undo any captures and exit, signaling to + // "doneLabel" is simply the final return location from the ValidateCurrentPositionMatches method that will undo any captures and exit, signaling to // the calling scan loop that nothing was matched. // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated @@ -820,13 +820,13 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); node = node.Child(0); - // In some limited cases, FindFirstChar will only return true if it successfully matched the whole expression. - // We can special case these to do essentially nothing in Go other than emit the capture. + // In some limited cases, FindNextPossibleStartingPosition will only return true if it successfully matched the whole expression. + // We can special case these to do essentially nothing in ValidateCurrentPositionMatches other than emit the capture. switch (node.Kind) { case RegexNodeKind.Multi or RegexNodeKind.Notone or RegexNodeKind.One or RegexNodeKind.Set when !IsCaseInsensitive(node): // This is the case for single and multiple characters, though the whole thing is only guaranteed - // to have been validated in FindFirstChar when doing case-sensitive comparison. + // to have been validated in FindNextPossibleStartingPosition when doing case-sensitive comparison. writer.WriteLine($"int start = base.runtextpos;"); writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? 
node.Str!.Length : 1)};"); writer.WriteLine("base.Capture(0, start, end);"); diff --git a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx index ebfd629548c0b..9e905df551a93 100644 --- a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx +++ b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx @@ -314,6 +314,6 @@ balancing group (?<name1-name2>subexpression) or (?'name1-name2' subexpression) - Attempted to call Span overloads with a pre compiled Regex engine is not supported. Please use the string overloads instead. + Searching an input span using a pre-compiled Regex assembly is not supported. Please use the string overloads or use a newer Regex implementation. \ No newline at end of file diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 93ccb73fa3d9d..684274d50bb56 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -84,7 +84,7 @@ internal void Reset(Regex regex, string? text, int textbeg, int textend, int tex _groupcoll?.Reset(); } - internal bool FoundAMatch + internal bool FoundMatch { get => _matchcount[0] > 0; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 9f97f2636aa81..c10a6bb223670 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -404,7 +404,7 @@ protected void InitializeReferences() Match? 
match = runner.runmatch; // if we got a match, set runmatch to null if quick is true - if (match!.FoundAMatch) + if (match!.FoundMatch) { runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache @@ -469,7 +469,7 @@ protected void InitializeReferences() runner.runmatch!.Text = null; // Drop reference to text - return runner.runmatch!.FoundAMatch ? null : RegularExpressions.Match.Empty; + return runner.runmatch!.FoundMatch ? null : RegularExpressions.Match.Empty; } finally { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index a75c74ed18dbb..a7a8f16a00cda 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -389,7 +389,7 @@ private bool Go(ReadOnlySpan inputSpan) switch (_operator) { case RegexOpcode.Stop: - return runmatch!.FoundAMatch; + return runmatch!.FoundMatch; case RegexOpcode.Nothing: break; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 8a1658c812516..e51c6efc55840 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -141,7 +141,7 @@ protected internal virtual void Scan(ReadOnlySpan text) #endif Go(); - if (runmatch!.FoundAMatch) + if (runmatch!.FoundMatch) { return runmatch; } @@ -308,7 +308,7 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref // See if we have a match. 
Match match = runmatch!; - if (match.FoundAMatch) + if (match.FoundMatch) { // Hand it out to the callback in canonical form. if (!reuseMatchObject) diff --git a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs index a52496948a12a..35e91df8386bb 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs @@ -18,11 +18,23 @@ public class PrecompiledRegexScenarioTest public void TestPrecompiledRegex() { string text = "asdf134success1245something"; + string textWithMultipleMatches = @"asdf134success1245something +bsdf135success1245somethingelse +csdf136success2245somethingnew +dsdf137success3245somethingold"; + string[] expectedMatches = textWithMultipleMatches.Split(Environment.NewLine); RegexTestClass testClass = new RegexTestClass(); // Test Matches overloads Assert.Equal(1, testClass.Matches(text).Count); Assert.Equal(0, testClass.Matches(text, startat: 7).Count); + MatchCollection multipleMatches = testClass.Matches(textWithMultipleMatches); + Assert.Equal(4, multipleMatches.Count); + for (int i = 0; i < expectedMatches.Length; i++) + { + Assert.Equal(expectedMatches[i], multipleMatches[i].Value.Trim()); // Calling Trim since the match will contain the new line as part of the match. 
+ } + // Test Match overloads Assert.Equal(1, testClass.Match(text).Groups[0].Captures.Count); @@ -36,7 +48,7 @@ public void TestPrecompiledRegex() // Test Groups Assert.Equal(text, testClass.Match(text).Groups[0].Value); - Assert.Equal(new int[] { 0, 1, 2}, testClass.GetGroupNumbers()); + Assert.Equal(new int[] { 0, 1, 2 }, testClass.GetGroupNumbers()); Assert.Equal(new string[] { "0", "1", "output" }, testClass.GetGroupNames()); } } @@ -52,8 +64,8 @@ public RegexTestClass() roptions = RegexOptions.IgnoreCase; internalMatchTimeout = TimeSpan.FromTicks(-10000L); factory = new RegexFactoryTestClass(); - Caps = new Hashtable {{0, 0}, {1, 1}, {2, 2}}; - CapNames = new Hashtable {{"0", 0}, {"1", 1}, {"output", 2}}; + Caps = new Hashtable { { 0, 0 }, { 1, 1 }, { 2, 2 } }; + CapNames = new Hashtable { { "0", 0 }, { "1", 1 }, { "output", 2 } }; capslist = new string[3]; capslist[0] = "0"; capslist[1] = "1"; @@ -152,7 +164,7 @@ protected override void Go() } } } - IL_441: + IL_441: while (true) { this.runtrackpos = num2; @@ -182,7 +194,7 @@ protected override void Go() } goto IL_49E; } - IL_4C7: + IL_4C7: this.CheckTimeout(); num = runtrack[num2++]; num4 = runtrack[num2++]; @@ -194,7 +206,7 @@ protected override void Go() continue; } continue; - IL_51D: + IL_51D: this.CheckTimeout(); num = runtrack[num2++]; num4 = runtrack[num2++]; @@ -204,7 +216,7 @@ protected override void Go() runtrack[--num2] = num - 1; runtrack[--num2] = 3; } - IL_204: + IL_204: this.CheckTimeout(); num4 = runstack[num3++]; this.Capture(1, num4, num); @@ -247,21 +259,21 @@ protected override void Go() runtrack[--num2] = num - 1; runtrack[--num2] = 5; } - IL_3FC: + IL_3FC: this.CheckTimeout(); num4 = runstack[num3++]; this.Capture(0, num4, num); runtrack[--num2] = num4; runtrack[num2 - 1] = 4; - IL_432: + IL_432: this.CheckTimeout(); this.runtextpos = num; return; - IL_49E: + IL_49E: this.CheckTimeout(); num = runtrack[num2++]; goto IL_432; - IL_598: + IL_598: this.CheckTimeout(); num = 
runtrack[num2++]; num4 = runtrack[num2++]; @@ -293,10 +305,10 @@ protected override bool FindFirstChar() while (num2 > 0); bool arg_74_0 = false; goto IL_6C; - IL_63: + IL_63: num--; arg_74_0 = true; - IL_6C: + IL_6C: this.runtextpos = num; return arg_74_0; } From 4f0bd06f52602e7562a56f588eae4bacf3448cd7 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Tue, 22 Feb 2022 22:08:05 -0800 Subject: [PATCH 09/17] Addressing most of the PR comments --- .../gen/RegexGenerator.Emitter.cs | 56 ++++++++----------- .../System/Text/RegularExpressions/Match.cs | 5 +- .../System/Text/RegularExpressions/Regex.cs | 25 +++++++-- .../Text/RegularExpressions/RegexCompiler.cs | 45 +++++++-------- .../RegularExpressions/RegexInterpreter.cs | 25 +++------ .../RegularExpressions/RegexLWCGCompiler.cs | 10 ++-- .../Text/RegularExpressions/RegexRunner.cs | 10 ++-- 7 files changed, 84 insertions(+), 92 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index c622bba167783..b57134a9c6c30 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -227,10 +227,10 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" }}"); writer.WriteLine(); - writer.WriteLine($" private bool FindNextPossibleStartingPosition(global::System.ReadOnlySpan inputSpan)"); + writer.WriteLine($" private bool TryFindNextPossibleStartingPosition(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; - RequiredHelperFunctions requiredHelpers = EmitFindNextPossibleStartingPosition(writer, rm, id); + RequiredHelperFunctions requiredHelpers = EmitTryFindNextPossibleStartingPosition(writer, rm, id); writer.Indent -= 4; writer.WriteLine($" }}"); writer.WriteLine(); @@ -238,10 +238,10 @@ private static 
ImmutableArray EmitRegexMethod(IndentedTextWriter wri { writer.WriteLine($" [global::System.Runtime.CompilerServices.SkipLocalsInit]"); } - writer.WriteLine($" private bool ValidateCurrentPositionMatches(global::System.ReadOnlySpan inputSpan)"); + writer.WriteLine($" private bool TryMatchAtCurrentPosition(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; - requiredHelpers |= EmitValidateCurrentPositionMatches(writer, rm, id); + requiredHelpers |= EmitTryMatchAtCurrentPosition(writer, rm, id); writer.Indent -= 4; writer.WriteLine($" }}"); @@ -285,7 +285,8 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" {{"); writer.WriteLine($" const char ZeroWidthNonJoiner = '\\u200C', ZeroWidthJoiner = '\\u200D';"); writer.WriteLine(); - writer.WriteLine($" return (index > 0 && IsBoundaryWordChar(inputSpan[index - 1])) !="); + writer.WriteLine($" int indexM1 = index - 1;"); + writer.WriteLine($" return ((uint)indexM1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexM1])) !="); writer.WriteLine($" ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));"); writer.WriteLine(); writer.WriteLine($" static bool IsBoundaryWordChar(char ch) =>"); @@ -301,7 +302,8 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); writer.WriteLine($" private static bool IsECMABoundary(global::System.ReadOnlySpan inputSpan, int index)"); writer.WriteLine($" {{"); - writer.WriteLine($" return (index > 0 && IsECMAWordChar(inputSpan[index - 1])) !="); + writer.WriteLine($" int indexM1 = index - 1;"); + writer.WriteLine($" return ((uint)indexM1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexM1])) !="); writer.WriteLine($" ((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));"); 
writer.WriteLine(); writer.WriteLine($" static bool IsECMAWordChar(char ch) =>"); @@ -342,26 +344,16 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) { - using (EmitBlock(writer, "while (true)")) + using (EmitBlock(writer, "while (TryFindNextPossibleStartingPosition(text))")) { - using (EmitBlock(writer, "if (FindNextPossibleStartingPosition(text))")) + if (rm.MatchTimeout != Timeout.Infinite) { - if (rm.MatchTimeout != Timeout.Infinite) - { - writer.WriteLine("base.CheckTimeout();"); - writer.WriteLine(); - } - - writer.WriteLine("// If we got a match, we're done."); - using (EmitBlock(writer, "if (ValidateCurrentPositionMatches(text))")) - { - writer.WriteLine("return;"); - } + writer.WriteLine("base.CheckTimeout();"); + writer.WriteLine(); } - writer.WriteLine(); - writer.WriteLine("// We failed to find a match. If we're at the end of the input, then we're done."); - using (EmitBlock(writer, "if (base.runtextpos == text.Length)")) + writer.WriteLine("// If we find a match on the current position, or we have reached the end of the input, we are done."); + using (EmitBlock(writer, "if (TryMatchAtCurrentPosition(text) || base.runtextpos == text.Length)")) { writer.WriteLine("return;"); } @@ -371,8 +363,8 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i } } - /// Emits the body of the FindNextPossibleStartingPosition. - private static RequiredHelperFunctions EmitFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, string id) + /// Emits the body of the TryFindNextPossibleStartingPosition. 
+ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, string id) { RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; @@ -461,7 +453,7 @@ private static RequiredHelperFunctions EmitFindNextPossibleStartingPosition(Inde // searching is required; otherwise, false. bool EmitAnchors() { - // Anchors that fully implement FindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. + // Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. switch (code.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: @@ -778,8 +770,8 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or } } - /// Emits the body of the ValidateCurrentPositionMatches. - private static RequiredHelperFunctions EmitValidateCurrentPositionMatches(IndentedTextWriter writer, RegexMethod rm, string id) + /// Emits the body of the TryMatchAtCurrentPosition. + private static RequiredHelperFunctions EmitTryMatchAtCurrentPosition(IndentedTextWriter writer, RegexMethod rm, string id) { // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled // version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via @@ -798,7 +790,7 @@ private static RequiredHelperFunctions EmitValidateCurrentPositionMatches(Indent // label that code should jump back to when backtracking. That way, a subsequent EmitXx function doesn't need to know exactly // where to jump: it simply always jumps to "doneLabel" on match failure, and "doneLabel" is always configured to point to // the right location. 
In an expression without backtracking, or before any backtracking constructs have been encountered, - // "doneLabel" is simply the final return location from the ValidateCurrentPositionMatches method that will undo any captures and exit, signaling to + // "doneLabel" is simply the final return location from the TryMatchAtCurrentPosition method that will undo any captures and exit, signaling to // the calling scan loop that nothing was matched. // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated @@ -820,13 +812,13 @@ private static RequiredHelperFunctions EmitValidateCurrentPositionMatches(Indent Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); node = node.Child(0); - // In some limited cases, FindNextPossibleStartingPosition will only return true if it successfully matched the whole expression. - // We can special case these to do essentially nothing in ValidateCurrentPositionMatches other than emit the capture. + // In some limited cases, TryFindNextPossibleStartingPosition will only return true if it successfully matched the whole expression. + // We can special case these to do essentially nothing in TryMatchAtCurrentPosition other than emit the capture. switch (node.Kind) { case RegexNodeKind.Multi or RegexNodeKind.Notone or RegexNodeKind.One or RegexNodeKind.Set when !IsCaseInsensitive(node): // This is the case for single and multiple characters, though the whole thing is only guaranteed - // to have been validated in FindNextPossibleStartingPosition when doing case-sensitive comparison. + // to have been validated in TryFindNextPossibleStartingPosition when doing case-sensitive comparison. writer.WriteLine($"int start = base.runtextpos;"); writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? 
node.Str!.Length : 1)};"); writer.WriteLine("base.Capture(0, start, end);"); @@ -2218,7 +2210,7 @@ void EmitBoundary(RegexNode node) _ => "IsECMABoundary", }; - var boundaryFunctionRequired = node.Kind switch + RequiredHelperFunctions boundaryFunctionRequired = node.Kind switch { RegexNodeKind.Boundary or RegexNodeKind.NonBoundary => RequiredHelperFunctions.IsBoundary | RequiredHelperFunctions.IsWordChar, // IsBoundary internally uses IsWordChar diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 684274d50bb56..fb06af8faafee 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -84,10 +84,7 @@ internal void Reset(Regex regex, string? text, int textbeg, int textend, int tex _groupcoll?.Reset(); } - internal bool FoundMatch - { - get => _matchcount[0] > 0; - } + internal bool FoundMatch => _matchcount[0] > 0; public virtual GroupCollection Groups => _groupcoll ??= new GroupCollection(this, null); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index c10a6bb223670..969f25455f136 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -467,8 +467,6 @@ protected void InitializeReferences() runner.Scan(input); - runner.runmatch!.Text = null; // Drop reference to text - return runner.runmatch!.FoundMatch ? 
null : RegularExpressions.Match.Empty; } finally @@ -485,6 +483,7 @@ internal void Run(string input, int startat, ref TState state, MatchCall try { runner.InitializeTimeout(internalMatchTimeout); + runner.runtext = input; int runtextpos = startat; while (true) { @@ -493,13 +492,24 @@ internal void Run(string input, int startat, ref TState state, MatchCall int stoppos = RightToLeft ? 0 : input.Length; - Match? match = InternalPerformScan(reuseMatchObject, input, 0, runner, input, false); + Match? match = InternalPerformScan(reuseMatchObject, input, 0, runner, input, returnNullIfQuick: false); - // if we got a match, set runmatch to null if quick is true + // if we got a match, then call the callback function with the match and prepare for next iteration. if (match!.Success) { + if (!reuseMatchObject) + { + // We're not reusing match objects, so null out our field reference to the instance. + // It'll be recreated the next time one is needed. + runner.runmatch = null; + } + if (!callback(ref state, match)) { + // If the callback returns false, we're done. + // Drop reference to text to avoid keeping it alive in a cache. + runner.runtext = null!; + if (reuseMatchObject) { // We're reusing the single match instance, so clear out its text as well. @@ -515,7 +525,6 @@ internal void Run(string input, int startat, ref TState state, MatchCall // the current position, just as Match.NextMatch() would pass in _textpos as textstart. runtextpos = startat = runner.runtextpos; - // Reset state for another iteration. runner.runtrackpos = runner.runtrack!.Length; runner.runstackpos = runner.runstack!.Length; @@ -525,6 +534,9 @@ internal void Run(string input, int startat, ref TState state, MatchCall { if (runner.runtextpos == stoppos) { + // Drop reference to text to avoid keeping it alive in a cache. + runner.runtext = null!; + if (reuseMatchObject) { // See above comment. 
@@ -544,6 +556,9 @@ internal void Run(string input, int startat, ref TState state, MatchCall // We failed to match at this position. If we're at the stopping point, we're done. if (runner.runtextpos == stoppos) { + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache + if (!reuseMatchObject) + runner.runmatch = null; return; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index ae0e619de4416..7c073bdbb970a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -190,6 +190,10 @@ protected void Ldthisfld(FieldInfo ft) _ilg!.Emit(OpCodes.Ldfld, ft); } + /// Fetches the address of argument in passed in + /// The position of the argument which address needs to be fetched. + private void Ldarga_s(int position) => _ilg!.Emit(OpCodes.Ldarga_S, position); + /// A macro for Ldthis(); Ldfld(); Stloc(); private void Mvfldloc(FieldInfo ft, LocalBuilder lt) { @@ -359,8 +363,8 @@ private void CallToLower() } } - /// Generates the implementation for FindFirstChar. - protected void EmitFindFirstChar() + /// Generates the implementation for TryFindNextPossibleStartingPosition. + protected void EmitTryFindNextPossibleStartingPosition() { Debug.Assert(_code != null); _int32LocalsPool?.Clear(); @@ -475,7 +479,7 @@ bool GenerateAnchors() { Label label; - // Anchors that fully implement FindFirstChar, with a check that leads to immediate success or failure determination. + // Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. 
switch (_code.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: @@ -1021,8 +1025,8 @@ void EmitLiteralAfterAtomicLoop() } } - /// Generates the implementation for Go. - protected void EmitGo() + /// Generates the implementation for TryMatchAtCurrentPosition. + protected void EmitTryMatchAtCurrentPosition() { // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled // version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via @@ -1041,7 +1045,7 @@ protected void EmitGo() // label that code should jump back to when backtracking. That way, a subsequent EmitXx function doesn't need to know exactly // where to jump: it simply always jumps to "doneLabel" on match failure, and "doneLabel" is always configured to point to // the right location. In an expression without backtracking, or before any backtracking constructs have been encountered, - // "doneLabel" is simply the final return location from the Go method that will undo any captures and exit, signaling to + // "doneLabel" is simply the final return location from the TryMatchAtCurrentPosition method that will undo any captures and exit, signaling to // the calling scan loop that nothing was matched. Debug.Assert(_code != null); @@ -1057,13 +1061,13 @@ protected void EmitGo() node = node.Child(0); - // In some limited cases, FindFirstChar will only return true if it successfully matched the whole expression. - // We can special case these to do essentially nothing in Go other than emit the capture. + // In some limited cases, TryFindNextPossibleStartingPosition will only return true if it successfully matched the whole expression. + // We can special case these to do essentially nothing in TryMatchAtCurrentPosition other than emit the capture. 
switch (node.Kind) { case RegexNodeKind.Multi or RegexNodeKind.Notone or RegexNodeKind.One or RegexNodeKind.Set when !IsCaseInsensitive(node): // This is the case for single and multiple characters, though the whole thing is only guaranteed - // to have been validated in FindFirstChar when doing case-sensitive comparison. + // to have been validated in TryFindNextPossibleStartingPosition when doing case-sensitive comparison. // base.Capture(0, base.runtextpos, base.runtextpos + node.Str.Length); // base.runtextpos = base.runtextpos + node.Str.Length; // return true; @@ -3963,21 +3967,17 @@ void EmitStackPop() } } - protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMethod) + protected void EmitScan(DynamicMethod tryFindNextStartingPositionMethod, DynamicMethod tryMatchAtCurrentPositionMethod) { Label returnLabel = DefineLabel(); - // while (true) - Label whileLoopEnd = DefineLabel(); + // while (TryFindNextPossibleStartingPosition(text)) Label whileLoopBody = DefineLabel(); MarkLabel(whileLoopBody); - - // if (FindFirstChar(text)) - Label postWhileLabel = DefineLabel(); Ldthis(); Ldarg_1(); - Call(findFirstCharMethod); - BrfalseFar(postWhileLabel); + Call(tryFindNextStartingPositionMethod); + BrfalseFar(returnLabel); if (_hasTimeout) { @@ -3986,18 +3986,14 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho Call(s_checkTimeoutMethod); } - // if (Go(text)) + // if (TryMatchAtCurrentPosition(text) || runtextpos == text.length) // return; Ldthis(); Ldarg_1(); - Call(goMethod); + Call(tryMatchAtCurrentPositionMethod); BrtrueFar(returnLabel); - - // if (runtextpos == text.length) - // return; - MarkLabel(postWhileLabel); Ldthisfld(s_runtextposField); - _ilg!.Emit(OpCodes.Ldarga_S, 1); + Ldarga_s(1); Call(s_spanGetLengthMethod); Ceq(); BrtrueFar(returnLabel); @@ -4011,7 +4007,6 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho // End loop body. 
BrFar(whileLoopBody); - MarkLabel(whileLoopEnd); // return; MarkLabel(returnLabel); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index a7a8f16a00cda..e72a0148a575e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -337,33 +337,24 @@ protected internal override void Scan(ReadOnlySpan text) stoppos = 0; } - while (true) + while (_code.FindOptimizations.TryFindNextStartingPosition(text, ref runtextpos, runtextbeg, runtextstart, runtextend)) { - if (_code.FindOptimizations.TryFindNextStartingPosition(text, ref runtextpos, runtextbeg, runtextstart, runtextend)) - { - CheckTimeout(); - - if (Go(text)) - { - return; - } - - // Reset state for another iteration. - runtrackpos = runtrack!.Length; - runstackpos = runstack!.Length; - runcrawlpos = runcrawl!.Length; - } + CheckTimeout(); - if (runtextpos == stoppos) + if (TryMatchAtCurrentPosition(text) || runtextpos == stoppos) { return; } + // Reset state for another iteration. 
+ runtrackpos = runtrack!.Length; + runstackpos = runstack!.Length; + runcrawlpos = runcrawl!.Length; runtextpos += bump; } } - private bool Go(ReadOnlySpan inputSpan) + private bool TryMatchAtCurrentPosition(ReadOnlySpan inputSpan) { SetOperator((RegexOpcode)_code.Codes[0]); _codepos = 0; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index a204e55e002c6..fe467efd05a41 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -52,14 +52,14 @@ internal sealed class RegexLWCGCompiler : RegexCompiler description = string.Concat("_", pattern.Length > DescriptionLimit ? pattern.AsSpan(0, DescriptionLimit) : pattern); } - DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); - EmitFindFirstChar(); + DynamicMethod tryfindNextPossibleStartPositionMethod = DefineDynamicMethod($"Regex{regexNum}_TryFindNextPossibleStartingPosition{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); + EmitTryFindNextPossibleStartingPosition(); - DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); - EmitGo(); + DynamicMethod tryMatchAtCurrentPositionMethod = DefineDynamicMethod($"Regex{regexNum}_TryMatchAtCurrentPosition{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); + EmitTryMatchAtCurrentPosition(); DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan) }); - EmitScan(findFirstCharMethod, goMethod); + 
EmitScan(tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod); return new CompiledRegexRunnerFactory(scanMethod); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index e51c6efc55840..d6036405094af 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -465,8 +465,9 @@ protected bool IsBoundary(int index, int startpos, int endpos) internal bool IsBoundary(ReadOnlySpan inputSpan, int index) { - return (index > runtextbeg && RegexCharClass.IsBoundaryWordChar(inputSpan[index - 1])) != - (index < inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[index])); + int indexM1 = index - 1; + return ((uint)indexM1 < (uint)inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[indexM1])) != + ((uint)index < (uint)inputSpan.Length && RegexCharClass.IsBoundaryWordChar(inputSpan[index])); } /// Called to determine a char's inclusion in the \w set. 
@@ -480,8 +481,9 @@ protected bool IsECMABoundary(int index, int startpos, int endpos) internal bool IsECMABoundary(ReadOnlySpan inputSpan, int index) { - return (index > runtextbeg && RegexCharClass.IsECMAWordChar(inputSpan[index - 1])) != - (index < inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[index])); + int indexM1 = index - 1; + return ((uint)indexM1 < (uint)inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[indexM1])) != + ((uint)index < (uint)inputSpan.Length && RegexCharClass.IsECMAWordChar(inputSpan[index])); } protected static bool CharInSet(char ch, string set, string category) From 94d24e3d98d1de22fd34ef844c103d3d017b4e6b Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Wed, 23 Feb 2022 11:17:19 -0800 Subject: [PATCH 10/17] Fix issue where runtextbeg and length do not match the size of the input --- .../System/Text/RegularExpressions/Regex.cs | 4 +++ .../Text/RegularExpressions/RegexRunner.cs | 34 ++++++++++++------- .../tests/PrecompiledRegexScenarioTest.cs | 4 +++ 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 969f25455f136..7fe1ae849169e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -400,6 +400,10 @@ protected void InitializeReferences() private static Match? InternalPerformScan(bool quick, string input, int beginning, RegexRunner runner, ReadOnlySpan span, bool returnNullIfQuick) { + // We need to save the original beginning of the input in case we are in the CompileToAssembly case, which relies on + // setting runtextbeg and runtextend correctly. 
We cannot calculate these values on the fly based on input and the span + // because it is not guaranteed that span.Length + beginning = input. + runner.originalRuntextbeg = beginning; runner.Scan(span); Match? match = runner.runmatch; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index d6036405094af..8240039b35e4f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -20,7 +20,9 @@ namespace System.Text.RegularExpressions { public abstract class RegexRunner { - protected internal int runtextbeg; // beginning of text to search + protected internal int runtextbeg; // beginning of text to search. We now always use a sliced span of the input + // from runtextbeg to runtextend, which means that runtextbeg is now always 0 except + // for CompiledToAssembly scenario which works over the original input. protected internal int runtextend; // end of text to search protected internal int runtextstart; // starting point for search @@ -57,6 +59,10 @@ public abstract class RegexRunner protected internal Match? runmatch; // result object protected internal Regex? runregex; // regex object + internal int originalRuntextbeg; // In the CompiledToAssembly case, it is important to store the original runtexbeg + // that was passed in from the user, mainly because it works over the original input + // as opposed to working over the sliced span. + // TODO: Expose something as protected internal: https://github.com/dotnet/runtime/issues/59629 private protected bool quick; // false if match details matter, true if only the fact that match occurred matters @@ -91,25 +97,29 @@ protected RegexRunner() { } protected internal virtual void Scan(ReadOnlySpan text) { string? 
s = runtext; - // If beginning was passed in, then runtext and span won't match lengths so calculate the beginning to be used for the comparison. - int beginning = (s != null && s.Length != text.Length) ? s.Length - text.Length : 0; - if (s == null || text != s.AsSpan(beginning, text.Length)) + // The passed in span is sliced from runtextbeg to runtextend already, but in the precompiled scenario + // we require to use the complete input and to use the full string instead. We first test to ensure that the + // passed in span matches the original input by using the original runtextbeg. If that is not the case, + // then it means the user is calling the new span-based APIs using CompiledToAssembly, so we throw NSE + // so as to prevent a lot of unexpected allocations. + if (s == null || text != s.AsSpan(originalRuntextbeg, text.Length)) { // If we landed here then we are dealing with a CompiledToAssembly case where the new Span overloads are being called. throw new NotSupportedException(SR.UsingSpanAPIsWithCompiledToAssembly); } - // If beginning wasn't zero, then we have to adjust some of the - // internal fields of RegexRunner to ensure the Precompiled Go and FFC - // will work as expected. - if (beginning != 0) + // If the original beginning wasn't zero, then we have to adjust some of the + // internal fields of RegexRunner to ensure the Precompiled Go and FFC methods + // will continue to work as expected since they work over the original input, as opposed + // to using the sliced span. 
+ if (originalRuntextbeg != 0) { - runtextbeg = beginning; - runtextstart += beginning; - runtextend += beginning; + runtextbeg = originalRuntextbeg; + runtextstart += originalRuntextbeg; + runtextend += originalRuntextbeg; } - Scan(runregex!, s, beginning, beginning + text.Length, runtextstart + beginning, -1, quick, runregex!.internalMatchTimeout); + Scan(runregex!, s, originalRuntextbeg, originalRuntextbeg + text.Length, runtextstart + originalRuntextbeg, -1, quick, runregex!.internalMatchTimeout); } protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) diff --git a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs index 35e91df8386bb..326a5417d9d0f 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs @@ -25,6 +25,8 @@ public void TestPrecompiledRegex() string[] expectedMatches = textWithMultipleMatches.Split(Environment.NewLine); RegexTestClass testClass = new RegexTestClass(); + + // Test Matches overloads Assert.Equal(1, testClass.Matches(text).Count); Assert.Equal(0, testClass.Matches(text, startat: 7).Count); @@ -40,6 +42,8 @@ public void TestPrecompiledRegex() Assert.Equal(1, testClass.Match(text).Groups[0].Captures.Count); Assert.Equal(Match.Empty, testClass.Match(text, beginning: 7, length: text.Length - 7)); Assert.Equal(5, testClass.Match(text, beginning: 5, length: text.Length - 5).Index); + Assert.False(testClass.Match("asdf134succes1245somethingasdf134success1245something", 0, 27).Success); // The first 27 characters shouldn't match. + Assert.True(testClass.Match("asdf134succes1245somethingasdf134success1245something", 26, 27).Success); // The last 27 characters should match. 
Assert.Equal(Match.Empty, testClass.Match(text, startat: 7)); Assert.Equal(6, testClass.Match(text, startat: 6).Index); From 62b5b2bbaeb74de990d4fb3f731c2d2f7a31111e Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Wed, 23 Feb 2022 15:04:46 -0800 Subject: [PATCH 11/17] Add Precompiled tests for all overloads of methods. --- .../System/Text/RegularExpressions/Regex.cs | 2 +- .../tests/PrecompiledRegexScenarioTest.cs | 64 ++++++++++++++++--- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 7fe1ae849169e..f151029790e29 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -487,11 +487,11 @@ internal void Run(string input, int startat, ref TState state, MatchCall try { runner.InitializeTimeout(internalMatchTimeout); - runner.runtext = input; int runtextpos = startat; while (true) { runner.InitializeForScan(this, input, startat, false); + runner.runtext = input; runner.runtextpos = runtextpos; int stoppos = RightToLeft ? 
0 : input.Length; diff --git a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs index 326a5417d9d0f..1ed5936006ca5 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs @@ -14,19 +14,18 @@ namespace System.Text.RegularExpressions.Tests { public class PrecompiledRegexScenarioTest { - [Fact] - public void TestPrecompiledRegex() - { - string text = "asdf134success1245something"; - string textWithMultipleMatches = @"asdf134success1245something + const string text = "asdf134success1245something"; + const string textWithMultipleMatches = @"asdf134success1245something bsdf135success1245somethingelse csdf136success2245somethingnew dsdf137success3245somethingold"; + + [Fact] + public void PrecompiledRegex_MatchesTest() + { string[] expectedMatches = textWithMultipleMatches.Split(Environment.NewLine); RegexTestClass testClass = new RegexTestClass(); - - // Test Matches overloads Assert.Equal(1, testClass.Matches(text).Count); Assert.Equal(0, testClass.Matches(text, startat: 7).Count); @@ -36,9 +35,13 @@ public void TestPrecompiledRegex() { Assert.Equal(expectedMatches[i], multipleMatches[i].Value.Trim()); // Calling Trim since the match will contain the new line as part of the match. 
} + } + [Fact] + public void PrecompiledRegex_MatchTest() + { + RegexTestClass testClass = new RegexTestClass(); - // Test Match overloads Assert.Equal(1, testClass.Match(text).Groups[0].Captures.Count); Assert.Equal(Match.Empty, testClass.Match(text, beginning: 7, length: text.Length - 7)); Assert.Equal(5, testClass.Match(text, beginning: 5, length: text.Length - 5).Index); @@ -46,11 +49,52 @@ public void TestPrecompiledRegex() Assert.True(testClass.Match("asdf134succes1245somethingasdf134success1245something", 26, 27).Success); // The last 27 characters should match. Assert.Equal(Match.Empty, testClass.Match(text, startat: 7)); Assert.Equal(6, testClass.Match(text, startat: 6).Index); + } + + [Fact] + public void PrecompiledRegex_ReplaceTest() + { + RegexTestClass testClass = new RegexTestClass(); + + Assert.Equal("4success", testClass.Replace(text, "$1${output}")); + Assert.Equal("4success", testClass.Replace(text, (match) => + { + return $"{match.Groups[1]}{match.Groups["output"]}"; + })); + Assert.Equal("4success\n5success\n6success\n7success", testClass.Replace(textWithMultipleMatches, "$1${output}")); + } + + [Fact] + public void PrecompiledRegex_SplitTest() + { + RegexTestClass testClass = new RegexTestClass(); + + Assert.Equal(new[] { "", "4", "success", "\n", "5", "success", "\n", "6", "success", "\n", "7", "success", "" }, testClass.Split(textWithMultipleMatches)); + Assert.Equal(new[] { "", "4", "success", "\nbsdf135success1245somethingelse\r\ncsdf136success2245somethingnew\r\ndsdf137success3245somethingold" }, testClass.Split(textWithMultipleMatches, 2)); + } + + [Fact] + public void PrecompiledRegex_CountTest() + { + RegexTestClass testClass = new RegexTestClass(); + + Assert.Equal(4, testClass.Count(textWithMultipleMatches)); + Assert.Equal(4, testClass.Count(textWithMultipleMatches)); + } + + [Fact] + public void PrecompiledRegex_ThrowsWhenSpanIsMatchIsCalled() + { + RegexTestClass testClass = new RegexTestClass(); - // Test Span-based IsMatch 
throws for Precompiled scenario. Assert.Throws(() => testClass.IsMatch(text.AsSpan())); + } + + [Fact] + public void PrecompiledRegex_Groups() + { + RegexTestClass testClass = new RegexTestClass(); - // Test Groups Assert.Equal(text, testClass.Match(text).Groups[0].Value); Assert.Equal(new int[] { 0, 1, 2 }, testClass.GetGroupNumbers()); Assert.Equal(new string[] { "0", "1", "output" }, testClass.GetGroupNames()); From f23c3f2bed4b301062557935166ab9241ed00f71 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Thu, 24 Feb 2022 11:19:53 -0800 Subject: [PATCH 12/17] Fix precompiled test failing in non-Windows due to new lines. --- .../tests/PrecompiledRegexScenarioTest.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs index 1ed5936006ca5..fa4a8a686c40a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/PrecompiledRegexScenarioTest.cs @@ -70,7 +70,7 @@ public void PrecompiledRegex_SplitTest() RegexTestClass testClass = new RegexTestClass(); Assert.Equal(new[] { "", "4", "success", "\n", "5", "success", "\n", "6", "success", "\n", "7", "success", "" }, testClass.Split(textWithMultipleMatches)); - Assert.Equal(new[] { "", "4", "success", "\nbsdf135success1245somethingelse\r\ncsdf136success2245somethingnew\r\ndsdf137success3245somethingold" }, testClass.Split(textWithMultipleMatches, 2)); + Assert.Equal(new[] { "", "4", "success", $"\nbsdf135success1245somethingelse{Environment.NewLine}csdf136success2245somethingnew{Environment.NewLine}dsdf137success3245somethingold" }, testClass.Split(textWithMultipleMatches, 2)); } [Fact] From 7f9351789be48bf27b151d77926d1753b8715c49 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Thu, 24 Feb 2022 14:24:32 -0800 Subject: [PATCH 13/17] 
Apply suggestions from code review Co-authored-by: Stephen Toub --- .../gen/RegexGenerator.Emitter.cs | 6 ++-- .../CompiledRegexRunnerFactory.cs | 2 +- .../System/Text/RegularExpressions/Regex.cs | 29 ++++++++++--------- .../Text/RegularExpressions/RegexRunner.cs | 1 + 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index b57134a9c6c30..278cf9e3c7ddf 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -283,15 +283,12 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]"); writer.WriteLine($" private static bool IsBoundary(global::System.ReadOnlySpan inputSpan, int index)"); writer.WriteLine($" {{"); - writer.WriteLine($" const char ZeroWidthNonJoiner = '\\u200C', ZeroWidthJoiner = '\\u200D';"); - writer.WriteLine(); writer.WriteLine($" int indexM1 = index - 1;"); writer.WriteLine($" return ((uint)indexM1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexM1])) !="); writer.WriteLine($" ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));"); writer.WriteLine(); writer.WriteLine($" static bool IsBoundaryWordChar(char ch) =>"); - writer.WriteLine($" IsWordChar(ch) ||"); - writer.WriteLine($" (ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner);"); + writer.WriteLine($" IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');"); writer.WriteLine($" }}"); } @@ -342,6 +339,7 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) } } + /// Emits the body of the Scan method override. 
private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) { using (EmitBlock(writer, "while (TryFindNextPossibleStartingPosition(text))")) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs index ab5a5ed4f913e..4a9147e4d363b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs @@ -9,7 +9,7 @@ internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory { private readonly DynamicMethod _scanMethod; - // Delegates are lazily created to avoid forcing JIT'ing until the regex is actually executed. + // Delegate is lazily created to avoid forcing JIT'ing until the regex is actually executed. private CompiledRegexRunner.ScanDelegate? _scan; public CompiledRegexRunnerFactory(DynamicMethod scanMethod) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index f151029790e29..290458b8983b5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -407,8 +407,10 @@ protected void InitializeReferences() runner.Scan(span); Match? match = runner.runmatch; - // if we got a match, set runmatch to null if quick is true - if (match!.FoundMatch) + Debugger.Assert(match is not null); + + // If we got a match, set runmatch to null if quick is true. 
+ if (match.FoundMatch) { runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache @@ -419,12 +421,14 @@ protected void InitializeReferences() if (quick && returnNullIfQuick) { - runner.runmatch!.Text = null; // Drop reference to text + match.Text = null; // Drop reference to text return null; } if (!quick) + { runner.runmatch = null; + } match.Tidy(runner.runtextpos); @@ -446,11 +450,7 @@ protected void InitializeReferences() } else { - - if (runner.runmatch != null) - { - runner.runmatch.Text = null; - } + match.Text = null; } return RegularExpressions.Match.Empty; @@ -497,9 +497,10 @@ internal void Run(string input, int startat, ref TState state, MatchCall int stoppos = RightToLeft ? 0 : input.Length; Match? match = InternalPerformScan(reuseMatchObject, input, 0, runner, input, returnNullIfQuick: false); + Debug.Assert(match is not null); // if we got a match, then call the callback function with the match and prepare for next iteration. - if (match!.Success) + if (match.Success) { if (!reuseMatchObject) { @@ -512,7 +513,7 @@ internal void Run(string input, int startat, ref TState state, MatchCall { // If the callback returns false, we're done. // Drop reference to text to avoid keeping it alive in a cache. - runner.runtext = null!; + runner.runtext = null; if (reuseMatchObject) { @@ -520,7 +521,7 @@ internal void Run(string input, int startat, ref TState state, MatchCall // We don't do this if we're not reusing instances, as in that case we're // dropping the whole reference to the match, and we no longer own the instance // having handed it out to the callback. - match.Text = null!; + match.Text = null; } return; } @@ -539,12 +540,12 @@ internal void Run(string input, int startat, ref TState state, MatchCall if (runner.runtextpos == stoppos) { // Drop reference to text to avoid keeping it alive in a cache. - runner.runtext = null!; + runner.runtext = null; if (reuseMatchObject) { // See above comment. 
- match.Text = null!; + match.Text = null; } return; } @@ -562,7 +563,9 @@ internal void Run(string input, int startat, ref TState state, MatchCall { runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache if (!reuseMatchObject) + { runner.runmatch = null; + } return; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 8240039b35e4f..fbf0efeae7058 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -97,6 +97,7 @@ protected RegexRunner() { } protected internal virtual void Scan(ReadOnlySpan text) { string? s = runtext; + // The passed in span is sliced from runtextbeg to runtextend already, but in the precompiled scenario // we require to use the complete input and to use the full string instead. We first test to ensure that the // passed in span matches the original input by using the original runtextbeg. 
If that is not the case, From 08e2a113a2b652092775175a519ff4e67f636be0 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Thu, 24 Feb 2022 16:44:06 -0800 Subject: [PATCH 14/17] Addressing more PR Feedback --- .../System/Text/RegularExpressions/Regex.cs | 32 +-- .../Text/RegularExpressions/RegexRunner.cs | 193 +++--------------- 2 files changed, 39 insertions(+), 186 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 290458b8983b5..ed1a0fb9c1306 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -394,34 +394,28 @@ protected void InitializeReferences() } finally { + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache. _runner = runner; } } private static Match? InternalPerformScan(bool quick, string input, int beginning, RegexRunner runner, ReadOnlySpan span, bool returnNullIfQuick) { - // We need to save the original beginning of the input in case we are in the CompileToAssembly case, which relies on - // setting runtextbeg and runtextend correctly. We cannot calculate these values on the fly based on input and the span - // because it is not guaranteed that span.Length + beginning = input. - runner.originalRuntextbeg = beginning; runner.Scan(span); Match? match = runner.runmatch; - Debugger.Assert(match is not null); + Debug.Assert(match is not null); // If we got a match, set runmatch to null if quick is true. if (match.FoundMatch) { - runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache - - if (match.Text != input) + if (!quick) { - match.Text = input; + match.Text = input; // We need to save the input into the match object which will be returned. 
} if (quick && returnNullIfQuick) { - match.Text = null; // Drop reference to text return null; } @@ -432,9 +426,8 @@ protected void InitializeReferences() match.Tidy(runner.runtextpos); - // If there was a match and the original text was sliced, then add beginning to the index to get the real - // Index of the match. - if (match.Success && beginning != 0) + // If the passed in beginning was not 0 then we need to adjust the offests on the match object. + if (beginning != 0) { match.AddBeginningToIndex(beginning); } @@ -442,8 +435,6 @@ protected void InitializeReferences() return match; } - runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache - if (!quick) { runner.runmatch = null; @@ -471,7 +462,7 @@ protected void InitializeReferences() runner.Scan(input); - return runner.runmatch!.FoundMatch ? null : RegularExpressions.Match.Empty; + return runner.runmatch == null || runner.runmatch.FoundMatch ? null : RegularExpressions.Match.Empty; } finally { @@ -487,11 +478,11 @@ internal void Run(string input, int startat, ref TState state, MatchCall try { runner.InitializeTimeout(internalMatchTimeout); + runner.runtext = input; int runtextpos = startat; while (true) { runner.InitializeForScan(this, input, startat, false); - runner.runtext = input; runner.runtextpos = runtextpos; int stoppos = RightToLeft ? 0 : input.Length; @@ -512,8 +503,6 @@ internal void Run(string input, int startat, ref TState state, MatchCall if (!callback(ref state, match)) { // If the callback returns false, we're done. - // Drop reference to text to avoid keeping it alive in a cache. - runner.runtext = null; if (reuseMatchObject) { @@ -539,9 +528,6 @@ internal void Run(string input, int startat, ref TState state, MatchCall { if (runner.runtextpos == stoppos) { - // Drop reference to text to avoid keeping it alive in a cache. - runner.runtext = null; - if (reuseMatchObject) { // See above comment. 
@@ -561,7 +547,6 @@ internal void Run(string input, int startat, ref TState state, MatchCall // We failed to match at this position. If we're at the stopping point, we're done. if (runner.runtextpos == stoppos) { - runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache if (!reuseMatchObject) { runner.runmatch = null; @@ -573,6 +558,7 @@ internal void Run(string input, int startat, ref TState state, MatchCall } finally { + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache. _runner = runner; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index fbf0efeae7058..39b42c68b8247 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -20,10 +20,12 @@ namespace System.Text.RegularExpressions { public abstract class RegexRunner { - protected internal int runtextbeg; // beginning of text to search. We now always use a sliced span of the input + protected internal int runtextbeg; // Beginning of text to search. We now always use a sliced span of the input // from runtextbeg to runtextend, which means that runtextbeg is now always 0 except // for CompiledToAssembly scenario which works over the original input. - protected internal int runtextend; // end of text to search + protected internal int runtextend; // End of text to search. Because we now pass in a sliced span of the input into Scan, + // the runtextend will always match the length of that passed in span except for CompileToAssemby + // scenario, which still works over the original input. protected internal int runtextstart; // starting point for search protected internal string? 
runtext; // text to search @@ -59,10 +61,6 @@ public abstract class RegexRunner protected internal Match? runmatch; // result object protected internal Regex? runregex; // regex object - internal int originalRuntextbeg; // In the CompiledToAssembly case, it is important to store the original runtexbeg - // that was passed in from the user, mainly because it works over the original input - // as opposed to working over the sliced span. - // TODO: Expose something as protected internal: https://github.com/dotnet/runtime/issues/59629 private protected bool quick; // false if match details matter, true if only the fact that match occurred matters @@ -98,12 +96,18 @@ protected internal virtual void Scan(ReadOnlySpan text) { string? s = runtext; + // We can assume that the passed in 'text' span is a slice of the original text input runtext. That said we need to calculate + // what the original beginning was and can't do it by just using the lengths of text and runtext, since we can't guarantee that + // the passed in beginning and length match the size of the original input. We instead use MemoryExtensions Overlaps to find the + // offset in memory between them. We intentionally use s.Overlaps(text) since we want to get a positive value. + _ = s.AsSpan().Overlaps(text, out int beginning); + // The passed in span is sliced from runtextbeg to runtextend already, but in the precompiled scenario // we require to use the complete input and to use the full string instead. We first test to ensure that the // passed in span matches the original input by using the original runtextbeg. If that is not the case, // then it means the user is calling the new span-based APIs using CompiledToAssembly, so we throw NSE // so as to prevent a lot of unexpected allocations. 
- if (s == null || text != s.AsSpan(originalRuntextbeg, text.Length)) + if (s == null || text != s.AsSpan(beginning, text.Length)) { // If we landed here then we are dealing with a CompiledToAssembly case where the new Span overloads are being called. throw new NotSupportedException(SR.UsingSpanAPIsWithCompiledToAssembly); @@ -113,14 +117,14 @@ protected internal virtual void Scan(ReadOnlySpan text) // internal fields of RegexRunner to ensure the Precompiled Go and FFC methods // will continue to work as expected since they work over the original input, as opposed // to using the sliced span. - if (originalRuntextbeg != 0) + if (beginning != 0) { - runtextbeg = originalRuntextbeg; - runtextstart += originalRuntextbeg; - runtextend += originalRuntextbeg; + runtextbeg = beginning; + runtextstart += beginning; + runtextend += beginning; } - Scan(runregex!, s, originalRuntextbeg, originalRuntextbeg + text.Length, runtextstart + originalRuntextbeg, -1, quick, runregex!.internalMatchTimeout); + Scan(runregex!, s, beginning, beginning + text.Length, runtextstart + beginning, -1, quick, runregex!.internalMatchTimeout); } protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) @@ -176,9 +180,9 @@ protected internal virtual void Scan(ReadOnlySpan text) internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textstart, bool quick) { - this.quick = quick; // Store remaining arguments into fields now that we're going to start the scan. // These are referenced by the derived runner. 
+ this.quick = quick; runregex = regex; runtextstart = textstart; runtextbeg = 0; @@ -239,160 +243,23 @@ internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textst internal void InitializeTimeout(TimeSpan timeout) { // Handle timeout argument - _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds - bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; - if (!ignoreTimeout) + _ignoreTimeout = true; + if (Regex.InfiniteMatchTimeout != timeout) { - // We are using Environment.TickCount and not Stopwatch for performance reasons. - // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt - // overflow it will still stay ahead of Environment.TickCount for comparisons made - // in DoCheckTimeout(). - Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected - _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; - _timeoutOccursAt = Environment.TickCount + _timeout; - _timeoutChecksToSkip = TimeoutCheckFrequency; - } - } - - /// Enumerates all of the matches with the specified regex, invoking the callback for each. - /// - /// This optionally repeatedly hands out the same Match instance, updated with new information. - /// should be set to false if the Match object is handed out to user code. - /// - internal void ScanInternal(Regex regex, string text, int textstart, ref TState state, MatchCallback callback, bool reuseMatchObject, TimeSpan timeout) - { - quick = false; - - // Handle timeout argument - _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds - bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; - if (!ignoreTimeout) - { - // We are using Environment.TickCount and not Stopwatch for performance reasons. - // Environment.TickCount is an int that cycles. 
We intentionally let timeoutOccursAt - // overflow it will still stay ahead of Environment.TickCount for comparisons made - // in DoCheckTimeout(). - _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; - _timeoutOccursAt = Environment.TickCount + _timeout; - _timeoutChecksToSkip = TimeoutCheckFrequency; - } - - // Configure the additional value to "bump" the position along each time we loop around - // to call FindFirstChar again, as well as the stopping position for the loop. We generally - // bump by 1 and stop at text.Length, but if we're examining right-to-left, we instead bump - // by -1 and stop at 0. - int bump = 1, stoppos = text.Length; - if (regex.RightToLeft) - { - bump = -1; - stoppos = 0; - } - - // Store remaining arguments into fields now that we're going to start the scan. - // These are referenced by the derived runner. - runregex = regex; - runtextstart = runtextpos = textstart; - runtext = text; - runtextend = text.Length; - runtextbeg = 0; - - // Main loop: FindFirstChar/Go + bump until the ending position. - while (true) - { - // Find the next potential location for a match in the input. -#if DEBUG - Debug.WriteLineIf(Regex.EnableDebugTracing, $"Calling FindFirstChar at {nameof(runtextbeg)}={runtextbeg}, {nameof(runtextpos)}={runtextpos}, {nameof(runtextend)}={runtextend}"); -#endif - if (FindFirstChar()) - { - if (!ignoreTimeout) - { - DoCheckTimeout(); - } + ConfigureTimeout(timeout); -#if DEBUG - Debug.WriteLineIf(Regex.EnableDebugTracing, $"Calling Go at {nameof(runtextpos)}={runtextpos}"); -#endif - - // See if there's a match at this position. - Go(); - - // See if we have a match. - Match match = runmatch!; - if (match.FoundMatch) - { - // Hand it out to the callback in canonical form. - if (!reuseMatchObject) - { - // We're not reusing match objects, so null out our field reference to the instance. - // It'll be recreated the next time one is needed. 
- runmatch = null; - } - match.Tidy(runtextpos); - if (!callback(ref state, match)) - { - // If the callback returns false, we're done. - // Drop reference to text to avoid keeping it alive in a cache. - runtext = null!; - if (reuseMatchObject) - { - // We're reusing the single match instance, so clear out its text as well. - // We don't do this if we're not reusing instances, as in that case we're - // dropping the whole reference to the match, and we no longer own the instance - // having handed it out to the callback. - match.Text = null!; - } - return; - } - - // Now that we've matched successfully, update the starting position to reflect - // the current position, just as Match.NextMatch() would pass in _textpos as textstart. - runtextstart = runtextpos; - - // Reset state for another iteration. - runtrackpos = runtrack!.Length; - runstackpos = runstack!.Length; - runcrawlpos = runcrawl!.Length; - if (match.Length == 0) - { - if (runtextpos == stoppos) - { - // Drop reference to text to avoid keeping it alive in a cache. - runtext = null!; - if (reuseMatchObject) - { - // See above comment. - match.Text = null!; - } - return; - } - - runtextpos += bump; - } - - // Loop around to perform next match from where we left off. - continue; - } - - // Ran Go but it didn't find a match. Reset state for another iteration. - runtrackpos = runtrack!.Length; - runstackpos = runstack!.Length; - runcrawlpos = runcrawl!.Length; - } - - // We failed to match at this position. If we're at the stopping point, we're done. - if (runtextpos == stoppos) + void ConfigureTimeout(TimeSpan timeout) { - runtext = null; // drop reference to text to avoid keeping it alive in a cache - if (runmatch != null) - { - runmatch.Text = null!; - } - return; + // We are using Environment.TickCount and not Stopwatch for performance reasons. + // Environment.TickCount is an int that cycles. 
We intentionally let timeoutOccursAt + // overflow it will still stay ahead of Environment.TickCount for comparisons made + // in DoCheckTimeout(). + Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected + _ignoreTimeout = false; + _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; + _timeoutOccursAt = Environment.TickCount + _timeout; + _timeoutChecksToSkip = TimeoutCheckFrequency; } - - // Bump by one (in whichever direction is appropriate) and loop to go again. - runtextpos += bump; } } From 9b5d70492e65ba293137141b5def7c88eb906a94 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Thu, 24 Feb 2022 23:03:58 -0800 Subject: [PATCH 15/17] Fix protected Scan method --- .../Text/RegularExpressions/RegexRunner.cs | 62 ++++++++++++++++++- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 39b42c68b8247..0304c10f9e58d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -124,10 +124,69 @@ protected internal virtual void Scan(ReadOnlySpan text) runtextend += beginning; } - Scan(runregex!, s, beginning, beginning + text.Length, runtextstart + beginning, -1, quick, runregex!.internalMatchTimeout); + InternalScan(runregex!, beginning, beginning + text.Length); } + /// + /// This method's body is only kept since it is a protected member that could be called by someone outside + /// the assembly. + /// protected internal Match? 
Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) + { + InitializeTimeout(timeout); + + // We set runtext before calling InitializeForScan so that runmatch object is initialized with the text + runtext = text; + + InitializeForScan(regex, text, textstart, quick); + + // InitializeForScan will default runtextstart and runtextend to 0 and length of string + // since it is configured to work over a sliced portion of text so we adjust those values. + runtextstart = textstart; + runtextend = textend; + + // Configure the additional value to "bump" the position along each time we loop around + // to call FindFirstChar again, as well as the stopping position for the loop. We generally + // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump + // by -1 and stop at textbeg. + int bump = 1, stoppos = textend; + if (regex.RightToLeft) + { + bump = -1; + stoppos = textbeg; + } + + // If previous match was empty or failed, advance by one before matching. + if (prevlen == 0) + { + if (textstart == stoppos) + { + return Match.Empty; + } + + runtextpos += bump; + } + + Match match = InternalScan(regex, textbeg, textend); + runtext = null; //drop reference + runmatch!.Text = null; + + if (match.FoundMatch) + { + if (quick) + { + return null; + } + + runmatch = null; + match.Tidy(runtextpos); + } + + return match; + + } + + private Match InternalScan(Regex regex, int textbeg, int textend) { // Configure the additional value to "bump" the position along each time we loop around // to call FindFirstChar again, as well as the stopping position for the loop. We generally @@ -254,7 +313,6 @@ void ConfigureTimeout(TimeSpan timeout) // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt // overflow it will still stay ahead of Environment.TickCount for comparisons made // in DoCheckTimeout(). 
- Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected _ignoreTimeout = false; _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; _timeoutOccursAt = Environment.TickCount + _timeout; From 47d312f9809bd487436df65541b94f3e1ed20f52 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Fri, 25 Feb 2022 15:45:18 -0800 Subject: [PATCH 16/17] Addressing feedback and adding more tests --- .../System/Text/RegularExpressions/Match.cs | 8 + .../Text/RegularExpressions/Regex.Match.cs | 51 +-- .../System/Text/RegularExpressions/Regex.cs | 43 +-- .../RegularExpressions/RegexInterpreter.cs | 13 +- .../Text/RegularExpressions/RegexRunner.cs | 13 +- .../tests/CustomDerivedRegexScenarioTest.cs | 313 ++++++++++++++++++ .../tests/Regex.Match.Tests.cs | 8 + .../tests/RegexRunnerTests.cs | 64 ++++ ...ystem.Text.RegularExpressions.Tests.csproj | 2 + 9 files changed, 463 insertions(+), 52 deletions(-) create mode 100644 src/libraries/System.Text.RegularExpressions/tests/CustomDerivedRegexScenarioTest.cs create mode 100644 src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index fb06af8faafee..8ae239aac9b3d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -84,6 +84,14 @@ internal void Reset(Regex regex, string? text, int textbeg, int textend, int tex _groupcoll?.Reset(); } + /// + /// Returns if this object represents a successful match, and otherwise. 
+ /// + /// + /// The main difference between the public property and this one, is that requires + /// for a to call first, in order to report the correct value, while this API will return + /// the correct value right after a Match gets calculated, meaning that it will return right after + /// internal bool FoundMatch => _matchcount[0] > 0; public virtual GroupCollection Groups => _groupcoll ??= new GroupCollection(this, null); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index af023dd99c58d..4bf6af10683fb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -14,11 +14,14 @@ public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Reg RegexCache.GetOrAdd(pattern).IsMatch(input); /// - /// Searches the input span for one or more occurrences of the text supplied in the given pattern. - /// - /// The input span to be searched on. - /// The Regex pattern to be used for matching. - /// if the input matches the pattern, otherwise. + /// Indicates whether the specified regular expression finds a match in the specified input span. + /// + /// The span to search for a match. + /// The regular expression pattern to match. + /// if the regular expression finds a match; otherwise, . + /// A regular expression parsing error occurred. + /// is + /// A time-out occurred. 
public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => RegexCache.GetOrAdd(pattern).IsMatch(input); @@ -31,12 +34,16 @@ public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Reg RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); /// - /// Searches the input span for one or more occurrences of the text supplied in the given pattern. It uses the passed in options. + /// Indicates whether the specified regular expression finds a match in the specified input span, using the specified matching options. /// - /// The input span to be searched on. - /// The Regex pattern to be used for matching. - /// The options to be used for matching - /// if the input matches the pattern, otherwise. + /// The span to search for a match. + /// The regular expression pattern to match. + /// A bitwise combination of the enumeration values that provide options for matching. + /// if the regular expression finds a match; otherwise, . + /// A regular expression parsing error occurred. + /// is + /// A time-out occurred. + /// is not in a valid value. public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); @@ -44,13 +51,18 @@ public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Reg RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); /// - /// Searches the input span for one or more occurrences of the text supplied in the given pattern under the specified timeout. It uses the passed in options. + /// Indicates whether the specified regular expression finds a match in the specified input span, using the specified matching options and time-out interval. /// - /// The input span to be searched on. - /// The Regex pattern to be used for matching. 
- /// The options to be used for matching - /// Max time to be used for matching before returning. - /// if the input matches the pattern, otherwise. Also returns for time out. + /// The span to search for a match. + /// The regular expression pattern to match. + /// A bitwise combination of the enumeration values that provide options for matching. + /// A time-out interval, or to indicate that the method should not time out. + /// if the regular expression finds a match; otherwise, . + /// A regular expression parsing error occurred. + /// is + /// A time-out occurred. + /// is not in a valid value or is negative, + /// zero, or greater than approximately 24 days. public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); @@ -69,10 +81,11 @@ public bool IsMatch(string input) } /// - /// Searches the input span for one or more matches using the previous pattern, - /// options, and starting position. + /// Indicates whether the regular expression specified in the Regex constructor finds a match in a specified input span. /// - /// if the input matches the pattern, otherwise. + /// The span to search for a match. + /// if the regular expression finds a match; otherwise, . + /// A time-out ocurred. public bool IsMatch(ReadOnlySpan input) => Run(input, UseOptionR() ? input.Length : 0) is null; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index ed1a0fb9c1306..c07558b20f1c5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -406,27 +406,24 @@ protected void InitializeReferences() Match? 
match = runner.runmatch; Debug.Assert(match is not null); - // If we got a match, set runmatch to null if quick is true. + // If we got a match, do some cleanup and return it, or return null if quick is true; if (match.FoundMatch) { if (!quick) { - match.Text = input; // We need to save the input into the match object which will be returned. + // We're about to return the Match object. Store the input into it and remove it from the runner. + match.Text = input; + runner.runmatch = null; } - - if (quick && returnNullIfQuick) + else if (returnNullIfQuick) { + match.Text = null; return null; } - if (!quick) - { - runner.runmatch = null; - } - match.Tidy(runner.runtextpos); - // If the passed in beginning was not 0 then we need to adjust the offests on the match object. + // If the passed in beginning was not 0 then we need to adjust the offsets on the match object. if (beginning != 0) { match.AddBeginningToIndex(beginning); @@ -435,24 +432,18 @@ protected void InitializeReferences() return match; } - if (!quick) - { - runner.runmatch = null; - } - else - { - match.Text = null; - } + // We failed to match, so we will return Match.Empty which means we can reuse runmatch object. + // We do however need to clear its Text in case it was set, so as to not keep it alive in some cache. + runner.runmatch!.Text = null; return RegularExpressions.Match.Empty; } internal Match? Run(ReadOnlySpan input, int startat) { - if ((uint)startat > (uint)input.Length) - { - ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.startat, ExceptionResource.BeginIndexNotNegative); - } + // startat parameter is always either 0 or input.Length since public API for IsMatch doesn't have an overload + // that takes in startat. + Debug.Assert(startat <= input.Length); RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try @@ -462,7 +453,9 @@ protected void InitializeReferences() runner.Scan(input); - return runner.runmatch == null || runner.runmatch.FoundMatch ? 
null : RegularExpressions.Match.Empty; + // If runmatch is null it means that an override of Scan didn't implement it correctly, so we will + // let this null ref since there are lots of ways where you can end up in a erroneous state. + return runner.runmatch!.FoundMatch ? null : RegularExpressions.Match.Empty; } finally { @@ -547,10 +540,6 @@ internal void Run(string input, int startat, ref TState state, MatchCall // We failed to match at this position. If we're at the stopping point, we're done. if (runner.runtextpos == stoppos) { - if (!reuseMatchObject) - { - runner.runmatch = null; - } return; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index e72a0148a575e..1092db83c243f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -326,12 +326,17 @@ private bool MatchRef(int index, int length, ReadOnlySpan inputSpan) protected internal override void Scan(ReadOnlySpan text) { + Debug.Assert(runregex is not null); + Debug.Assert(runtrack is not null); + Debug.Assert(runstack is not null); + Debug.Assert(runcrawl is not null); + // Configure the additional value to "bump" the position along each time we loop around // to call TryFindNextStartingPosition again, as well as the stopping position for the loop. We generally // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump // by -1 and stop at textbeg. int bump = 1, stoppos = text.Length; - if (runregex!.RightToLeft) + if (runregex.RightToLeft) { bump = -1; stoppos = 0; @@ -347,9 +352,9 @@ protected internal override void Scan(ReadOnlySpan text) } // Reset state for another iteration. 
- runtrackpos = runtrack!.Length; - runstackpos = runstack!.Length; - runcrawlpos = runcrawl!.Length; + runtrackpos = runtrack.Length; + runstackpos = runstack.Length; + runcrawlpos = runcrawl.Length; runtextpos += bump; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 0304c10f9e58d..13a20bb44c60a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -94,13 +94,19 @@ protected RegexRunner() { } protected internal virtual void Scan(ReadOnlySpan text) { + // This base implementation is overridden by all of the built-in engines and by all source-generated + // implementations. The only time this should end up being used is if someone is using a Regex-derived + // type created by .NET Framework's Regex.CompileToAssembly, in which case it will have overridden + // FindFirstChar and Go but not Scan (which didn't exist yet). This isn't an officially supported configuration, + // using assemblies built for .NET Framework and targeting .NET Framework surface area against this + // implementation, but we make a best-effort to keep things functional. string? s = runtext; // We can assume that the passed in 'text' span is a slice of the original text input runtext. That said we need to calculate // what the original beginning was and can't do it by just using the lengths of text and runtext, since we can't guarantee that // the passed in beginning and length match the size of the original input. We instead use MemoryExtensions Overlaps to find the // offset in memory between them. We intentionally use s.Overlaps(text) since we want to get a positive value. 
- _ = s.AsSpan().Overlaps(text, out int beginning); + s.AsSpan().Overlaps(text, out int beginning); // The passed in span is sliced from runtextbeg to runtextend already, but in the precompiled scenario // we require to use the complete input and to use the full string instead. We first test to ensure that the @@ -169,7 +175,6 @@ protected internal virtual void Scan(ReadOnlySpan text) Match match = InternalScan(regex, textbeg, textend); runtext = null; //drop reference - runmatch!.Text = null; if (match.FoundMatch) { @@ -181,6 +186,10 @@ protected internal virtual void Scan(ReadOnlySpan text) runmatch = null; match.Tidy(runtextpos); } + else + { + runmatch!.Text = null; + } return match; diff --git a/src/libraries/System.Text.RegularExpressions/tests/CustomDerivedRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/CustomDerivedRegexScenarioTest.cs new file mode 100644 index 0000000000000..5f40a3c2e56fc --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/tests/CustomDerivedRegexScenarioTest.cs @@ -0,0 +1,313 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Threading;
+using Xunit;
+
+namespace System.Text.RegularExpressions.Tests
+{
+    /// <summary>
+    /// Drives <see cref="CustomDerivedRegex"/> through the protected Scan overload and checks the resulting matches.
+    /// </summary>
+    public class CustomDerivedRegexScenarioTest
+    {
+        [Fact]
+        public void CallProtectedScanMethodFromCustomDerivedRegex()
+        {
+            CustomDerivedRegex regex = new();
+            Assert.True(regex.CallScanDirectly(regex, "3456", 0, 4, 0, -1, false).Success); // "3" + one group of three digits
+            Assert.False(regex.CallScanDirectly(regex, "456", 0, 3, 0, -1, false).Success); // no digits left for the lookahead
+            Assert.Equal("45", regex.CallScanDirectly(regex, "45456", 0, 5, 0, -1, false).Value);
+            Assert.Equal("896", regex.CallScanDirectly(regex, "45896456", 0, 8, 2, -1, false).Value); // textstart: 2
+            Assert.Equal(Match.Empty, regex.CallScanDirectly(regex, "I dont match", 0, 12, 0, -1, false));
+            // With quick: true no Match object is expected back, even for an input that matches.
+            Assert.Null(regex.CallScanDirectly(regex, "3456", 0, 4, 0, -1, true));
+        }
+    }
+
+    /// <summary>
+    /// This type was generated using an earlier version of the Regex Source Generator which still overrides Go and FindFirstChar.
+    /// The purpose of this class is to validate that if a derived RegexRunner is invoking the protected Scan methods, they should call
+    /// the overridden Go and FindFirstChar methods and return the expected results.
+    /// </summary>
+    internal class CustomDerivedRegex : Regex
+    {
+        private readonly CustomRegexRunnerFactory.CustomRegexRunner runner;
+
+        public CustomDerivedRegex()
+        {
+            pattern = /*lang=regex*/@"\G(\d{1,3})(?=(?:\d{3})+\b)";
+            roptions = RegexOptions.Compiled;
+            internalMatchTimeout = Timeout.InfiniteTimeSpan;
+            factory = new CustomRegexRunnerFactory();
+            capsize = 2;
+            // CreateRunner is non-public; the direct cast fails fast if it does not hand back our runner type.
+            MethodInfo createRunnerMethod = typeof(Regex).GetMethod("CreateRunner", BindingFlags.Instance | BindingFlags.NonPublic);
+            runner = (CustomRegexRunnerFactory.CustomRegexRunner)createRunnerMethod.Invoke(this, new object[] { });
+        }
+
+        public Match? CallScanDirectly(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick)
+            => runner.CallScanDirectly(regex, text, textbeg, textend, textstart, prevlen, quick);
+
+        internal class CustomRegexRunnerFactory : RegexRunnerFactory
+        {
+            protected override RegexRunner CreateInstance() => new CustomRegexRunner();
+
+            internal class CustomRegexRunner : RegexRunner
+            {
+                public Match? CallScanDirectly(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick)
+                    => Scan(regex, text, textbeg, textend, textstart, prevlen, quick);
+
+                protected override void InitTrackCount() => base.runtrackcount = 12;
+
+                // Description:
+                // ○ Match if at the start position.
+                // ○ 1st capture group.
+                //     ○ Match a Unicode digit greedily at least 1 and at most 3 times.
+                // ○ Zero-width positive lookahead assertion.
+                //     ○ Loop greedily at least once.
+                //         ○ Match a Unicode digit exactly 3 times.
+                //     ○ Match if at a word boundary.
+
+                protected override bool FindFirstChar()
+                {
+                    int pos = runtextpos, end = runtextend;
+
+                    if (pos < end)
+                    {
+                        // Start \G anchor
+                        if (pos > runtextstart)
+                        {
+                            goto NoStartingPositionFound;
+                        }
+                        return true;
+                    }
+
+                    // No starting position found
+                    NoStartingPositionFound:
+                    runtextpos = end;
+                    return false;
+                }
+
+                protected override void Go()
+                {
+                    ReadOnlySpan<char> inputSpan = runtext.AsSpan();
+                    int pos = base.runtextpos, end = base.runtextend;
+                    int original_pos = pos;
+                    int charloop_starting_pos = 0, charloop_ending_pos = 0;
+                    int loop_iteration = 0, loop_starting_pos = 0;
+                    int stackpos = 0;
+                    int start = base.runtextstart;
+                    ReadOnlySpan<char> slice = inputSpan.Slice(pos, end - pos);
+
+                    // Match if at the start position.
+                    {
+                        if (pos != start)
+                        {
+                            goto NoMatch;
+                        }
+                    }
+
+                    // 1st capture group.
+                    //{
+                        int capture_starting_pos = pos;
+
+                        // Match a Unicode digit greedily at least 1 and at most 3 times.
+                        //{
+                            charloop_starting_pos = pos;
+
+                            int iteration = 0;
+                            while (iteration < 3 && (uint)iteration < (uint)slice.Length && char.IsDigit(slice[iteration]))
+                            {
+                                iteration++;
+                            }
+
+                            if (iteration == 0)
+                            {
+                                goto NoMatch;
+                            }
+
+                            slice = slice.Slice(iteration);
+                            pos += iteration;
+
+                            charloop_ending_pos = pos;
+                            charloop_starting_pos++;
+                            goto CharLoopEnd;
+
+                            CharLoopBacktrack:
+                            UncaptureUntil(base.runstack![--stackpos]);
+                            StackPop2(base.runstack, ref stackpos, out charloop_ending_pos, out charloop_starting_pos);
+
+                            if (charloop_starting_pos >= charloop_ending_pos)
+                            {
+                                goto NoMatch;
+                            }
+                            pos = --charloop_ending_pos;
+                            slice = inputSpan.Slice(pos, end - pos);
+
+                            CharLoopEnd:
+                            StackPush3(ref base.runstack!, ref stackpos, charloop_starting_pos, charloop_ending_pos, base.Crawlpos());
+                        //}
+
+                        base.Capture(1, capture_starting_pos, pos);
+
+                        StackPush1(ref base.runstack!, ref stackpos, capture_starting_pos);
+                        goto SkipBacktrack;
+
+                        CaptureBacktrack:
+                        capture_starting_pos = base.runstack![--stackpos];
+                        goto CharLoopBacktrack;
+
+                        SkipBacktrack:;
+                    //}
+
+                    // Zero-width positive lookahead assertion.
+                    {
+                        int positivelookahead_starting_pos = pos;
+
+                        // Loop greedily at least once.
+                        //{
+                            loop_iteration = 0;
+                            loop_starting_pos = pos;
+
+                            LoopBody:
+                            StackPush3(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos, pos);
+
+                            loop_starting_pos = pos;
+                            loop_iteration++;
+
+                            // Match a Unicode digit exactly 3 times.
+                            {
+                                if ((uint)slice.Length < 3 ||
+                                    !char.IsDigit(slice[0]) ||
+                                    !char.IsDigit(slice[1]) ||
+                                    !char.IsDigit(slice[2]))
+                                {
+                                    goto LoopIterationNoMatch;
+                                }
+                            }
+
+                            pos += 3;
+                            slice = slice.Slice(3);
+                            if (pos != loop_starting_pos || loop_iteration == 0)
+                            {
+                                goto LoopBody;
+                            }
+                            goto LoopEnd;
+
+                            LoopIterationNoMatch:
+                            loop_iteration--;
+                            if (loop_iteration < 0)
+                            {
+                                goto CaptureBacktrack;
+                            }
+                            StackPop2(base.runstack, ref stackpos, out pos, out loop_starting_pos);
+                            UncaptureUntil(base.runstack![--stackpos]);
+                            slice = inputSpan.Slice(pos, end - pos);
+                            if (loop_iteration == 0)
+                            {
+                                goto CaptureBacktrack;
+                            }
+                            LoopEnd:;
+                        //}
+
+                        // Match if at a word boundary.
+                        {
+                            if (!base.IsBoundary(pos, base.runtextbeg, end))
+                            {
+                                goto LoopIterationNoMatch;
+                            }
+                        }
+
+                        pos = positivelookahead_starting_pos;
+                        slice = inputSpan.Slice(pos, end - pos);
+                    }
+
+                    // The input matched.
+                    base.runtextpos = pos;
+                    base.Capture(0, original_pos, pos);
+                    return;
+
+                    // The input didn't match.
+                    NoMatch:
+                    UncaptureUntil(0);
+                    return;
+
+                    // Pop 2 values from the backtracking stack.
+                    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                    static void StackPop2(int[] stack, ref int pos, out int arg0, out int arg1)
+                    {
+                        arg0 = stack[--pos];
+                        arg1 = stack[--pos];
+                    }
+
+                    // Push 1 value onto the backtracking stack.
+                    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                    static void StackPush1(ref int[] stack, ref int pos, int arg0)
+                    {
+                        // If there's space available for the value, store it.
+                        int[] s = stack;
+                        int p = pos;
+                        if ((uint)p < (uint)s.Length)
+                        {
+                            s[p] = arg0;
+                            pos++;
+                            return;
+                        }
+
+                        // Otherwise, resize the stack to make room and try again.
+                        WithResize(ref stack, ref pos, arg0);
+
+                        // Resize the backtracking stack array and push 1 value onto the stack.
+                        [MethodImpl(MethodImplOptions.NoInlining)]
+                        static void WithResize(ref int[] stack, ref int pos, int arg0)
+                        {
+                            Array.Resize(ref stack, (pos + 0) * 2);
+                            StackPush1(ref stack, ref pos, arg0);
+                        }
+                    }
+
+                    // Push 3 values onto the backtracking stack.
+                    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                    static void StackPush3(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
+                    {
+                        // If there's space available for all 3 values, store them.
+                        int[] s = stack;
+                        int p = pos;
+                        if ((uint)(p + 2) < (uint)s.Length)
+                        {
+                            s[p] = arg0;
+                            s[p + 1] = arg1;
+                            s[p + 2] = arg2;
+                            pos += 3;
+                            return;
+                        }
+
+                        // Otherwise, resize the stack to make room and try again.
+                        WithResize(ref stack, ref pos, arg0, arg1, arg2);
+
+                        // Resize the backtracking stack array and push 3 values onto the stack.
+                        [MethodImpl(MethodImplOptions.NoInlining)]
+                        static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
+                        {
+                            Array.Resize(ref stack, (pos + 2) * 2);
+                            StackPush3(ref stack, ref pos, arg0, arg1, arg2);
+                        }
+                    }
+
+                    // Undo captures until we reach the specified capture position.
+                    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                    void UncaptureUntil(int capturepos)
+                    {
+                        while (base.Crawlpos() > capturepos)
+                        {
+                            base.Uncapture();
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
index 5463ea85c0de4..2bcc0bf4b467c 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
@@ -2035,8 +2035,16 @@ private static void VerifyIsMatch(Regex? r, string input, bool expected, TimeSpa
             if (r == null)
             {
                 Assert.Equal(expected, timeout == Regex.InfiniteMatchTimeout ?
Regex.IsMatch(input, pattern, options) : Regex.IsMatch(input, pattern, options, timeout));
+                if (options == RegexOptions.None)
+                {
+                    Assert.Equal(expected, Regex.IsMatch(input, pattern));
+                }
 #if NET7_0_OR_GREATER
                 Assert.Equal(expected, timeout == Regex.InfiniteMatchTimeout ? Regex.IsMatch(input.AsSpan(), pattern, options) : Regex.IsMatch(input.AsSpan(), pattern, options, timeout));
+                if (options == RegexOptions.None)
+                {
+                    Assert.Equal(expected, Regex.IsMatch(input.AsSpan(), pattern));
+                }
 #endif
             }
             else
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs
new file mode 100644
index 0000000000000..46343cc55de56
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs
@@ -0,0 +1,64 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Reflection;
+using System.Threading.Tasks;
+using Xunit;
+
+namespace System.Text.RegularExpressions.Tests
+{
+    public class RegexRunnerTests
+    {
+        [Theory]
+        [InlineData(RegexEngine.Interpreter)]
+        [InlineData(RegexEngine.Compiled)]
+        [InlineData(RegexEngine.SourceGenerated)]
+        [InlineData(RegexEngine.NonBacktracking)]
+        public async Task EnginesThrowNotImplementedForGoAndFFC(RegexEngine engine)
+        {
+            Regex re = await RegexHelpers.GetRegexAsync(engine, /*lang=regex*/@"abc");
+
+            // Use reflection to ensure the runner is created so it can be fetched.
+            MethodInfo createRunnerMethod = typeof(Regex).GetMethod("CreateRunner", BindingFlags.Instance | BindingFlags.NonPublic);
+            RegexRunner runner = (RegexRunner)createRunnerMethod.Invoke(re, new object[] { });
+
+            // Use reflection to call Go and FFC and ensure each throws NotImplementedException (wrapped by the reflection call).
+            MethodInfo goMethod = typeof(RegexRunner).GetMethod("Go", BindingFlags.Instance | BindingFlags.NonPublic);
+            MethodInfo ffcMethod = typeof(RegexRunner).GetMethod("FindFirstChar", BindingFlags.Instance | BindingFlags.NonPublic);
+
+            // FindFirstChar and Go methods should not be implemented since built-in engines should be overriding and using Scan instead.
+            TargetInvocationException goInvocationException = Assert.Throws<TargetInvocationException>(() => goMethod.Invoke(runner, new object[] { }));
+            Assert.IsType<NotImplementedException>(goInvocationException.InnerException);
+            TargetInvocationException ffcInvocationException = Assert.Throws<TargetInvocationException>(() => ffcMethod.Invoke(runner, new object[] { }));
+            Assert.IsType<NotImplementedException>(ffcInvocationException.InnerException);
+        }
+
+        [Theory]
+        [InlineData(RegexEngine.Interpreter)]
+        [InlineData(RegexEngine.Compiled)]
+        [InlineData(RegexEngine.SourceGenerated)]
+        [InlineData(RegexEngine.NonBacktracking)]
+        public async Task EnsureRunmatchValueIsNulledAfterIsMatch(RegexEngine engine)
+        {
+            Regex re = await RegexHelpers.GetRegexAsync(engine, /*lang=regex*/@"abc");
+
+            // First call IsMatch which should initialize runmatch on the runner.
+            Assert.True(re.IsMatch("abcabcabc"));
+
+            // Ensure runmatch wasn't nulled out, since after calling IsMatch it should be reused.
+            FieldInfo runnerField = typeof(Regex).GetField("_runner", BindingFlags.Instance | BindingFlags.NonPublic);
+            RegexRunner runner = runnerField.GetValue(re) as RegexRunner;
+            FieldInfo runmatchField = typeof(RegexRunner).GetField("runmatch", BindingFlags.Instance | BindingFlags.NonPublic);
+            Match runmatch = runmatchField.GetValue(runner) as Match;
+            Assert.NotNull(runmatch);
+
+            // Ensure that the Value of runmatch was nulled out, so as to not keep a reference to it in a cache.
+            PropertyInfo textProperty = typeof(Match).GetProperty("Text", BindingFlags.Instance | BindingFlags.NonPublic);
+            Assert.Null(textProperty.GetValue(runmatch));
+            Assert.Equal(string.Empty, runmatch.Value);
+#if NET7_0_OR_GREATER
+            Assert.True(runmatch.ValueSpan == ReadOnlySpan<char>.Empty);
+#endif
+        }
+    }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj
index d95fa07ad73f1..d382adaad11c6 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj
+++ b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj
@@ -11,6 +11,7 @@
+
@@ -32,6 +33,7 @@
+
From 7dc8ca70e0284d28b1a443fef5dbb31d290945e9 Mon Sep 17 00:00:00 2001
From: Jose Perez Rodriguez
Date: Fri, 25 Feb 2022 20:58:20 -0800
Subject: [PATCH 17/17] Fix few failing tests in Mono runtime

---
 .../tests/Regex.Match.Tests.cs | 8 ++++----
 .../tests/RegexRunnerTests.cs | 14 ++++----------
 .../System.Text.RegularExpressions.Tests.csproj | 4 ++--
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
index 2bcc0bf4b467c..e839f6fcf8131 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
+++
b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1011,20 +1011,20 @@ public void Match_DefaultTimeout_Throws(RegexOptions options) if ((RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture) == RegexOptions.None) { Assert.Throws<RegexMatchTimeoutException>(() => new Regex(Pattern).Match(input)); - VerifyIsMatchThrows(new Regex(Pattern), input, Regex.InfiniteMatchTimeout); + Assert.Throws<RegexMatchTimeoutException>(() => new Regex(Pattern).IsMatch(input)); Assert.Throws<RegexMatchTimeoutException>(() => new Regex(Pattern).Matches(input).Count); Assert.Throws<RegexMatchTimeoutException>(() => Regex.Match(input, Pattern)); - VerifyIsMatchThrows(null, input, Regex.InfiniteMatchTimeout, Pattern); + Assert.Throws<RegexMatchTimeoutException>(() => Regex.IsMatch(input, Pattern)); Assert.Throws<RegexMatchTimeoutException>(() => Regex.Matches(input, Pattern).Count); } Assert.Throws<RegexMatchTimeoutException>(() => new Regex(Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)).Match(input)); - VerifyIsMatchThrows(new Regex(Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)), input, Regex.InfiniteMatchTimeout); + Assert.Throws<RegexMatchTimeoutException>(() => new Regex(Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)).IsMatch(input)); Assert.Throws<RegexMatchTimeoutException>(() => new Regex(Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)).Matches(input).Count); Assert.Throws<RegexMatchTimeoutException>(() => Regex.Match(input, Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture))); - VerifyIsMatchThrows(null, input, Regex.InfiniteMatchTimeout, Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)); + Assert.Throws<RegexMatchTimeoutException>(() => Regex.IsMatch(input, Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture))); Assert.Throws<RegexMatchTimeoutException>(() => Regex.Matches(input, Pattern, (RegexOptions)int.Parse(optionsString, CultureInfo.InvariantCulture)).Count); }, ((int)options).ToString(CultureInfo.InvariantCulture)).Dispose(); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs
b/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs index 46343cc55de56..82f2ae5a0336b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexRunnerTests.cs @@ -10,10 +10,7 @@ namespace System.Text.RegularExpressions.Tests public class RegexRunnerTests { [Theory] - [InlineData(RegexEngine.Interpreter)] - [InlineData(RegexEngine.Compiled)] - [InlineData(RegexEngine.SourceGenerated)] - [InlineData(RegexEngine.NonBacktracking)] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] public async Task EnginesThrowNotImplementedForGoAndFFC(RegexEngine engine) { Regex re = await RegexHelpers.GetRegexAsync(engine, /*lang=regex*/@"abc"); @@ -34,10 +31,7 @@ public async Task EnginesThrowNotImplementedForGoAndFFC(RegexEngine engine) } [Theory] - [InlineData(RegexEngine.Interpreter)] - [InlineData(RegexEngine.Compiled)] - [InlineData(RegexEngine.SourceGenerated)] - [InlineData(RegexEngine.NonBacktracking)] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] public async Task EnsureRunmatchValueIsNulledAfterIsMatch(RegexEngine engine) { Regex re = await RegexHelpers.GetRegexAsync(engine, /*lang=regex*/@"abc"); @@ -53,8 +47,8 @@ public async Task EnsureRunmatchValueIsNulledAfterIsMatch(RegexEngine engine) Assert.NotNull(runmatch); // Ensure that the Value of runmatch was nulled out, so as to not keep a reference to it in a cache. 
- PropertyInfo textProperty = typeof(Match).GetProperty("Text", BindingFlags.Instance | BindingFlags.NonPublic); - Assert.Null(textProperty.GetValue(runmatch)); + MethodInfo getTextMethod = typeof(Match).GetMethod("get_Text", BindingFlags.Instance | BindingFlags.NonPublic); + Assert.Null(getTextMethod.Invoke(runmatch, new object[] { })); Assert.Equal(string.Empty, runmatch.Value); #if NET7_0_OR_GREATER Assert.True(runmatch.ValueSpan == ReadOnlySpan<char>.Empty); diff --git a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj index d382adaad11c6..951257ab4b5b6 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj @@ -11,7 +11,6 @@ - @@ -33,7 +32,6 @@ - @@ -43,6 +41,8 @@ + +