From 7eb749c8b78609865edcad67f57084aa382632a3 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Sun, 18 Jul 2021 06:06:23 -0700 Subject: [PATCH] Eliminate backtracking in the interpreter for patterns with .* (#51508) * First cut of look up table for speeding up Go() * More efficient .* in RegexInterpreter * sq * Get more debug info * Remove assert and add unit test * Potential unit test * temp * Fix a bug * sq * Add extra protection to the backtracking optimization * Add unit test * Revert * RegexCompiler changes * sq * Remove debug unit tests * Add a length to the AsSpan call * Address RegexCompiler comments and add unit tests --- .../Text/RegularExpressions/RegexCompiler.cs | 140 +++++++++++++++--- .../RegularExpressions/RegexInterpreter.cs | 27 ++++ .../tests/Regex.Match.Tests.cs | 61 +++++++- 3 files changed, 210 insertions(+), 18 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 81f5ca065f0e1..25b3a18b4c841 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -64,6 +64,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!; + private static readonly MethodInfo s_spanLastIndexOfMethod = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; private static readonly MethodInfo s_stringIndexOfCharInt = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int) })!; @@ -90,6 +91,7 @@ internal abstract class RegexCompiler private LocalBuilder? _runstackLocal; private LocalBuilder? _textInfoLocal; // cached to avoid extraneous TLS hits from CurrentCulture and virtual calls to TextInfo private LocalBuilder? _loopTimeoutCounterLocal; // timeout counter for setrep and setloop + private LocalBuilder? _maxBacktrackPositionLocal; protected RegexOptions _options; // options protected RegexCode? _code; // the RegexCode object @@ -891,6 +893,8 @@ private void GenerateForwardSection() Mvfldloc(s_runtrackposField, _runtrackposLocal!); Mvfldloc(s_runstackField, _runstackLocal!); Mvfldloc(s_runstackposField, _runstackposLocal!); + Ldc(-1); + Stloc(_maxBacktrackPositionLocal!); _backpos = -1; @@ -1705,7 +1709,7 @@ protected void GenerateFindFirstChar() // if (!CharInClass(textSpan[i + 2], prefix[2], "...")) goto returnFalse; // ... Debug.Assert(charClassIndex == 0 || charClassIndex == 1); - for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++) + for (; charClassIndex < _leadingCharClasses.Length; charClassIndex++) { Debug.Assert(needLoop); Ldloca(textSpanLocal); @@ -3310,6 +3314,7 @@ protected void GenerateGo() } _runtextbegLocal = DeclareInt32(); _runtextendLocal = DeclareInt32(); + _maxBacktrackPositionLocal = DeclareInt32(); InitializeCultureForGoIfNecessary(); @@ -4258,7 +4263,61 @@ private void GenerateOneCode() //: break Backward; { string str = _strings![Operand(0)]; + Label multiCode = DefineLabel(); + if (!IsRightToLeft()) + { + // if (runtextend - runtextpos < c) + Ldloc(_runtextendLocal!); + Ldloc(_runtextposLocal!); + Sub(); + Ldc(str.Length); + BgeFar(multiCode); + // if (!caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition) + if (!IsCaseInsensitive()) + { + Ldloc(_maxBacktrackPositionLocal!); + Ldc(-1); + BeqFar(_backtrack); + Ldloc(_runtextposLocal!); + Ldloc(_maxBacktrackPositionLocal!); + BleFar(_backtrack); + // runtextpos = _maxBacktrackPosition; + Ldloc(_maxBacktrackPositionLocal!); + Stloc(_runtextposLocal!); + // ReadOnlySpan runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktractPosition); + Ldloc(_runtextLocal!); + Ldloc(_maxBacktrackPositionLocal!); + Ldloc(_runtextendLocal!); + Ldloc(_maxBacktrackPositionLocal!); + Sub(); + using (RentedLocalBuilder runtextSpanLocal = RentReadOnlySpanCharLocal()) + { + Call(s_stringAsSpanIntIntMethod); + Stloc(runtextSpanLocal); + using (RentedLocalBuilder lastIndexOfLocal = RentInt32Local()) + { + // int lastIndexOf = runtextSpan.LastIndexOf(str.AsSpan()); + Ldloc(runtextSpanLocal); + Ldstr(str); + Call(s_stringAsSpanMethod); + Call(s_spanLastIndexOfMethod); + Stloc(lastIndexOfLocal); + // if (lastIndexOf > -1) + Ldloc(lastIndexOfLocal); + Ldc(-1); + BleFar(_backtrack); + // runtextpos = lastIndexOf + _maxBacktrackPosition; + Ldloc(lastIndexOfLocal); + Ldloc(_maxBacktrackPositionLocal!); + Add(); + Stloc(_runtextposLocal!); + BrFar(_backtrack); + } + } + } + } + MarkLabel(multiCode); Ldc(str.Length); Ldloc(_runtextendLocal!); Ldloc(_runtextposLocal!); @@ -4598,6 +4657,9 @@ private void GenerateOneCode() using RentedLocalBuilder lenLocal = RentInt32Local(); using RentedLocalBuilder iLocal = RentInt32Local(); + using RentedLocalBuilder tempMaxBacktrackPositionLocal = RentInt32Local(); + Ldloc(_runtextposLocal!); + Stloc(tempMaxBacktrackPositionLocal); if (!IsRightToLeft()) { @@ -4847,6 +4909,12 @@ private void GenerateOneCode() DoPush(); Track(); + // if (_operator == RegexCode.Notoneloop) maxBacktrackPosition = tempMaxBacktrackPosition + if (_regexopcode == RegexCode.Notoneloop) + { + Ldloc(tempMaxBacktrackPositionLocal); + Stloc(_maxBacktrackPositionLocal!); + } } break; } @@ -4870,28 +4938,66 @@ private void GenerateOneCode() //: if (i > 0) //: Track(i - 1, pos - 1); //: Advance(2); - PopTrack(); - Stloc(_runtextposLocal!); + Label noBacktrackPositionBranch = DefineLabel(); PopTrack(); using (RentedLocalBuilder posLocal = RentInt32Local()) { Stloc(posLocal); - Ldloc(posLocal); - Ldc(0); - BleFar(AdvanceLabel()); + PopTrack(); + using (RentedLocalBuilder iBacktrackLocal = RentInt32Local()) + { + Stloc(iBacktrackLocal); + // if (!caseInsensitive && maxBacktrackPosition != -1 && pos > maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft) + if (!IsCaseInsensitive() && _regexopcode == (RegexCode.Notoneloop | RegexCode.Back) && !IsRightToLeft()) + { + Ldloc(_maxBacktrackPositionLocal!); + Ldc(-1); + Beq(noBacktrackPositionBranch); + Ldloc(posLocal); + Ldloc(_maxBacktrackPositionLocal!); + Ble(noBacktrackPositionBranch); + Ldloc(_runtextposLocal!); + Ldloc(posLocal); + Bge(noBacktrackPositionBranch); + /* + int difference = pos - maxBacktrackPosition; + pos = runtextpos; + i -= difference; + maxBacktrackPosition = -1; + */ + // int difference = pos - maxBacktrackPosition; + Ldloc(iBacktrackLocal); + Ldloc(posLocal); + Ldloc(_maxBacktrackPositionLocal!); + Sub(); + Sub(); + Stloc(iBacktrackLocal); + Ldloc(_runtextposLocal!); + Stloc(posLocal); + Ldc(-1); + Stloc(_maxBacktrackPositionLocal!); + } + + MarkLabel(noBacktrackPositionBranch); + Ldloc(posLocal); + Stloc(_runtextposLocal!); + Ldloc(iBacktrackLocal); + Ldc(0); + BleFar(AdvanceLabel()); + ReadyPushTrack(); + Ldloc(iBacktrackLocal); + } + Ldc(1); + Sub(); + DoPush(); ReadyPushTrack(); - Ldloc(posLocal); + Ldloc(_runtextposLocal!); + Ldc(1); + Sub(IsRightToLeft()); + DoPush(); + Trackagain(); + Advance(); } - Ldc(1); - Sub(); - DoPush(); - ReadyPushTrack(); - Ldloc(_runtextposLocal!); - Ldc(1); - Sub(IsRightToLeft()); - DoPush(); - Trackagain(); - Advance(); break; case RegexCode.Onelazy: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index d557ec6c3aa73..3e05926733fe9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -20,6 +20,7 @@ internal sealed class RegexInterpreter : RegexRunner private int _codepos; private bool _rightToLeft; private bool _caseInsensitive; + private int _maxBacktrackPosition = -1; public RegexInterpreter(RegexCode code, CultureInfo culture) { @@ -223,6 +224,20 @@ private bool MatchString(string str) { if (runtextend - runtextpos < c) { + // If MatchString was called after a greedy op such as a .*, we would have zipped runtextpos to the end without really examining any characters. Reset to maxBacktrackPos here as an optimization + if (!_caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition) + { + // If lastIndexOf is -1, we backtrack to the max extent possible. + runtextpos = _maxBacktrackPosition; + ReadOnlySpan runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktrackPosition); + int lastIndexOf = runtextSpan.LastIndexOf(str); + if (lastIndexOf > -1) + { + // Found the next position to match. Move runtextpos here + runtextpos = _maxBacktrackPosition + lastIndexOf; + } + } + return false; } @@ -1185,6 +1200,7 @@ protected override void Go() int len = Math.Min(Operand(1), Forwardchars()); char ch = (char)Operand(0); int i; + int tempMaxBacktrackPosition = runtextpos; if (!_rightToLeft && !_caseInsensitive) { @@ -1217,6 +1233,7 @@ protected override void Go() if (len > i && _operator == RegexCode.Notoneloop) { TrackPush(len - i - 1, runtextpos - Bump()); + _maxBacktrackPosition = tempMaxBacktrackPosition; } } advance = 2; @@ -1261,6 +1278,16 @@ protected override void Go() { int i = TrackPeek(); int pos = TrackPeek(1); + if (!_caseInsensitive && _maxBacktrackPosition != -1 && pos > _maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft) + { + // The Multi node has bumped us along already + int difference = pos - _maxBacktrackPosition; + Debug.Assert(difference > 0); + pos = runtextpos; + i -= difference; + // We shouldn't be backtracking anymore. + _maxBacktrackPosition = -1; + } runtextpos = pos; if (i > 0) { diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 719f280c91726..723fc68e53bab 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Globalization; using System.Linq; +using System.Reflection; using System.Tests; using Microsoft.DotNet.RemoteExecutor; using Xunit; @@ -204,9 +205,11 @@ public static IEnumerable Match_Basic_TestData() yield return new object[] { "abc", "abc", RegexOptions.None, 0, 3, true, "abc" }; yield return new object[] { "abc", "aBc", RegexOptions.None, 0, 3, false, string.Empty }; yield return new object[] { "abc", "aBc", RegexOptions.IgnoreCase, 0, 3, true, "aBc" }; + yield return new object[] { @"abc.*def", "abcghiDEF", RegexOptions.IgnoreCase, 0, 9, true, "abcghiDEF" }; // Using *, +, ?, {}: Actual - "a+\\.?b*\\.?c{2}" yield return new object[] { @"a+\.?b*\.+c{2}", "ab.cc", RegexOptions.None, 0, 5, true, "ab.cc" }; + yield return new object[] { @"[^a]+\.[^z]+", "zzzzz", RegexOptions.None, 0, 5, false, string.Empty }; // RightToLeft yield return new object[] { @"\s+\d+", "sdf 12sad", RegexOptions.RightToLeft, 0, 9, true, " 12" }; @@ -381,6 +384,62 @@ public static IEnumerable Match_Basic_TestData() yield return new object[] { "\u05D0(?:\u05D1|\u05D2|\u05D3)", "\u05D0\u05D2", options, 0, 2, true, "\u05D0\u05D2" }; yield return new object[] { "\u05D0(?:\u05D1|\u05D2|\u05D3)", "\u05D0\u05D4", options, 0, 0, false, "" }; } + + // .* Perf Optimization: Case sensitive + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled }) + { + yield return new object[] { @".*\nfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @"a.*\nfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @".*\nFoo", $"\nFooThis should match", options, 0, 21, true, "\nFoo" }; + yield return new object[] { @".*\nfoo", "\nfooThis should match", options, 4, 17, false, "" }; + + yield return new object[] { @".*\dfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @".*\dFoo", "This1Foo should match", options, 0, 21, true, "This1Foo" }; + yield return new object[] { @".*\dFoo", "This1foo should 2Foo match", options, 0, 26, true, "This1foo should 2Foo" }; + yield return new object[] { @".*\dFoo", "This1foo shouldn't 2foo match", options, 0, 29, false, "" }; + yield return new object[] { @".*\dfoo", "This1foo shouldn't 2foo match", options, 24, 5, false, "" }; + + yield return new object[] { @".*\dfoo", "1fooThis1foo should 1foo match", options, 4, 9, true, "This1foo" }; + yield return new object[] { @".*\dfoo", "This shouldn't match 1foo", options, 0, 20, false, "" }; + } + + // .* Perf Optimization: Case insensitive + foreach (RegexOptions options in new[] { RegexOptions.IgnoreCase, RegexOptions.IgnoreCase | RegexOptions.Compiled }) + { + yield return new object[] { @".*\nFoo", "\nfooThis should match", options, 0, 21, true, "\nfoo" }; + yield return new object[] { @".*\dFoo", "This1foo should match", options, 0, 21, true, "This1foo" }; + yield return new object[] { @".*\dFoo", "This1foo should 2FoO match", options, 0, 26, true, "This1foo should 2FoO" }; + yield return new object[] { @".*\dFoo", "This1Foo should 2fOo match", options, 0, 26, true, "This1Foo should 2fOo" }; + yield return new object[] { @".*\dfoo", "1fooThis1FOO should 1foo match", options, 4, 9, true, "This1FOO" }; + } + + // .* Perf Optimization: RTL, Case-sensitive + foreach (RegexOptions options in new[] { RegexOptions.None | RegexOptions.RightToLeft, RegexOptions.Compiled | RegexOptions.RightToLeft }) + { + yield return new object[] { @".*\nfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @"a.*\nfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @".*\nFoo", $"This should match\nFoo", options, 0, 21, true, "This should match\nFoo" }; + yield return new object[] { @".*\nfoo", "This should matchfoo\n", options, 4, 13, false, "" }; + + yield return new object[] { @".*\dfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @".*\dFoo", "This1Foo should match", options, 0, 21, true, "This1Foo" }; + yield return new object[] { @".*\dFoo", "This1foo should 2Foo match", options, 0, 26, true, "This1foo should 2Foo" }; + yield return new object[] { @".*\dFoo", "This1foo shouldn't 2foo match", options, 0, 29, false, "" }; + yield return new object[] { @".*\dfoo", "This1foo shouldn't 2foo match", options, 19, 0, false, "" }; + + yield return new object[] { @".*\dfoo", "1fooThis2foo should 1foo match", options, 8, 4, true, "2foo" }; + yield return new object[] { @".*\dfoo", "This shouldn't match 1foo", options, 0, 20, false, "" }; + } + + // .* Perf Optimization: RTL, case insensitive + foreach (RegexOptions options in new[] { RegexOptions.IgnoreCase | RegexOptions.RightToLeft, RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.RightToLeft }) + { + yield return new object[] { @".*\nFoo", "\nfooThis should match", options, 0, 21, true, "\nfoo" }; + yield return new object[] { @".*\dFoo", "This1foo should match", options, 0, 21, true, "This1foo" }; + yield return new object[] { @".*\dFoo", "This1foo should 2FoO match", options, 0, 26, true, "This1foo should 2FoO" }; + yield return new object[] { @".*\dFoo", "This1Foo should 2fOo match", options, 0, 26, true, "This1Foo should 2fOo" }; + yield return new object[] { @".*\dfoo", "1fooThis2FOO should 1foo match", options, 8, 4, true, "2FOO" }; + } } public static IEnumerable Match_Basic_TestData_NetCore() @@ -617,7 +676,7 @@ public void Match_Timeout_Loop_Throws(string pattern, RegexOptions options) public void Match_Timeout_Repetition_Throws(RegexOptions options) { int repetitionCount = 800_000_000; - var regex = new Regex(@"a\s{" + repetitionCount+ "}", options, TimeSpan.FromSeconds(1)); + var regex = new Regex(@"a\s{" + repetitionCount + "}", options, TimeSpan.FromSeconds(1)); string input = @"a" + new string(' ', repetitionCount) + @"b"; Assert.Throws(() => regex.Match(input)); }