Skip to content

Commit

Permalink
Eliminate backtracking in the interpreter for patterns with .* (dotnet#51508)
Browse files Browse the repository at this point in the history

* First cut of look up table for speeding up Go()

* More efficient .* in RegexInterpreter

* sq

* Get more debug info

* Remove assert and add unit test

* Potential unit test

* temp

* Fix a bug

* sq

* Add extra protection to the backtracking optimization

* Add unit test

* Revert

* RegexCompiler changes

* sq

* Remove debug unit tests

* Add a length to the AsSpan call

* Address RegexCompiler comments and add unit tests
  • Loading branch information
Prashanth Govindarajan authored Jul 18, 2021
1 parent fe49e55 commit 7eb749c
Show file tree
Hide file tree
Showing 3 changed files with 210 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!;
private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!;
private static readonly MethodInfo s_spanLastIndexOfMethod = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!;
private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_stringIndexOfCharInt = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int) })!;
Expand All @@ -90,6 +91,7 @@ internal abstract class RegexCompiler
private LocalBuilder? _runstackLocal;
private LocalBuilder? _textInfoLocal; // cached to avoid extraneous TLS hits from CurrentCulture and virtual calls to TextInfo
private LocalBuilder? _loopTimeoutCounterLocal; // timeout counter for setrep and setloop
private LocalBuilder? _maxBacktrackPositionLocal;

protected RegexOptions _options; // options
protected RegexCode? _code; // the RegexCode object
Expand Down Expand Up @@ -891,6 +893,8 @@ private void GenerateForwardSection()
Mvfldloc(s_runtrackposField, _runtrackposLocal!);
Mvfldloc(s_runstackField, _runstackLocal!);
Mvfldloc(s_runstackposField, _runstackposLocal!);
Ldc(-1);
Stloc(_maxBacktrackPositionLocal!);

_backpos = -1;

Expand Down Expand Up @@ -1705,7 +1709,7 @@ protected void GenerateFindFirstChar()
// if (!CharInClass(textSpan[i + 2], prefix[2], "...")) goto returnFalse;
// ...
Debug.Assert(charClassIndex == 0 || charClassIndex == 1);
for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++)
for (; charClassIndex < _leadingCharClasses.Length; charClassIndex++)
{
Debug.Assert(needLoop);
Ldloca(textSpanLocal);
Expand Down Expand Up @@ -3310,6 +3314,7 @@ protected void GenerateGo()
}
_runtextbegLocal = DeclareInt32();
_runtextendLocal = DeclareInt32();
_maxBacktrackPositionLocal = DeclareInt32();

InitializeCultureForGoIfNecessary();

Expand Down Expand Up @@ -4258,7 +4263,61 @@ private void GenerateOneCode()
//: break Backward;
{
string str = _strings![Operand(0)];
Label multiCode = DefineLabel();
if (!IsRightToLeft())
{
// if (runtextend - runtextpos < c)
Ldloc(_runtextendLocal!);
Ldloc(_runtextposLocal!);
Sub();
Ldc(str.Length);
BgeFar(multiCode);
// if (!caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition)
if (!IsCaseInsensitive())
{
Ldloc(_maxBacktrackPositionLocal!);
Ldc(-1);
BeqFar(_backtrack);
Ldloc(_runtextposLocal!);
Ldloc(_maxBacktrackPositionLocal!);
BleFar(_backtrack);
// runtextpos = _maxBacktrackPosition;
Ldloc(_maxBacktrackPositionLocal!);
Stloc(_runtextposLocal!);
// ReadOnlySpan<char> runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktrackPosition);
Ldloc(_runtextLocal!);
Ldloc(_maxBacktrackPositionLocal!);
Ldloc(_runtextendLocal!);
Ldloc(_maxBacktrackPositionLocal!);
Sub();
using (RentedLocalBuilder runtextSpanLocal = RentReadOnlySpanCharLocal())
{
Call(s_stringAsSpanIntIntMethod);
Stloc(runtextSpanLocal);
using (RentedLocalBuilder lastIndexOfLocal = RentInt32Local())
{
// int lastIndexOf = runtextSpan.LastIndexOf(str.AsSpan());
Ldloc(runtextSpanLocal);
Ldstr(str);
Call(s_stringAsSpanMethod);
Call(s_spanLastIndexOfMethod);
Stloc(lastIndexOfLocal);
// if (lastIndexOf > -1)
Ldloc(lastIndexOfLocal);
Ldc(-1);
BleFar(_backtrack);
// runtextpos = lastIndexOf + _maxBacktrackPosition;
Ldloc(lastIndexOfLocal);
Ldloc(_maxBacktrackPositionLocal!);
Add();
Stloc(_runtextposLocal!);
BrFar(_backtrack);
}
}
}
}

MarkLabel(multiCode);
Ldc(str.Length);
Ldloc(_runtextendLocal!);
Ldloc(_runtextposLocal!);
Expand Down Expand Up @@ -4598,6 +4657,9 @@ private void GenerateOneCode()

using RentedLocalBuilder lenLocal = RentInt32Local();
using RentedLocalBuilder iLocal = RentInt32Local();
using RentedLocalBuilder tempMaxBacktrackPositionLocal = RentInt32Local();
Ldloc(_runtextposLocal!);
Stloc(tempMaxBacktrackPositionLocal);

if (!IsRightToLeft())
{
Expand Down Expand Up @@ -4847,6 +4909,12 @@ private void GenerateOneCode()
DoPush();

Track();
// if (_operator == RegexCode.Notoneloop) maxBacktrackPosition = tempMaxBacktrackPosition
if (_regexopcode == RegexCode.Notoneloop)
{
Ldloc(tempMaxBacktrackPositionLocal);
Stloc(_maxBacktrackPositionLocal!);
}
}
break;
}
Expand All @@ -4870,28 +4938,66 @@ private void GenerateOneCode()
//: if (i > 0)
//: Track(i - 1, pos - 1);
//: Advance(2);
PopTrack();
Stloc(_runtextposLocal!);
Label noBacktrackPositionBranch = DefineLabel();
PopTrack();
using (RentedLocalBuilder posLocal = RentInt32Local())
{
Stloc(posLocal);
Ldloc(posLocal);
Ldc(0);
BleFar(AdvanceLabel());
PopTrack();
using (RentedLocalBuilder iBacktrackLocal = RentInt32Local())
{
Stloc(iBacktrackLocal);
// if (!caseInsensitive && maxBacktrackPosition != -1 && pos > maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft)
if (!IsCaseInsensitive() && _regexopcode == (RegexCode.Notoneloop | RegexCode.Back) && !IsRightToLeft())
{
Ldloc(_maxBacktrackPositionLocal!);
Ldc(-1);
Beq(noBacktrackPositionBranch);
Ldloc(posLocal);
Ldloc(_maxBacktrackPositionLocal!);
Ble(noBacktrackPositionBranch);
Ldloc(_runtextposLocal!);
Ldloc(posLocal);
Bge(noBacktrackPositionBranch);
/*
int difference = pos - maxBacktrackPosition;
pos = runtextpos;
i -= difference;
maxBacktrackPosition = -1;
*/
// i -= (pos - maxBacktrackPosition); pos = runtextpos; maxBacktrackPosition = -1;
Ldloc(iBacktrackLocal);
Ldloc(posLocal);
Ldloc(_maxBacktrackPositionLocal!);
Sub();
Sub();
Stloc(iBacktrackLocal);
Ldloc(_runtextposLocal!);
Stloc(posLocal);
Ldc(-1);
Stloc(_maxBacktrackPositionLocal!);
}

MarkLabel(noBacktrackPositionBranch);
Ldloc(posLocal);
Stloc(_runtextposLocal!);
Ldloc(iBacktrackLocal);
Ldc(0);
BleFar(AdvanceLabel());
ReadyPushTrack();
Ldloc(iBacktrackLocal);
}
Ldc(1);
Sub();
DoPush();
ReadyPushTrack();
Ldloc(posLocal);
Ldloc(_runtextposLocal!);
Ldc(1);
Sub(IsRightToLeft());
DoPush();
Trackagain();
Advance();
}
Ldc(1);
Sub();
DoPush();
ReadyPushTrack();
Ldloc(_runtextposLocal!);
Ldc(1);
Sub(IsRightToLeft());
DoPush();
Trackagain();
Advance();
break;

case RegexCode.Onelazy:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ internal sealed class RegexInterpreter : RegexRunner
private int _codepos;
private bool _rightToLeft;
private bool _caseInsensitive;
private int _maxBacktrackPosition = -1;

public RegexInterpreter(RegexCode code, CultureInfo culture)
{
Expand Down Expand Up @@ -223,6 +224,20 @@ private bool MatchString(string str)
{
if (runtextend - runtextpos < c)
{
// If MatchString was called after a greedy op such as .*, runtextpos would have been zipped to the end without really examining any characters. Reset it to _maxBacktrackPosition here as an optimization.
if (!_caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition)
{
// If lastIndexOf is -1, we backtrack to the max extent possible.
runtextpos = _maxBacktrackPosition;
ReadOnlySpan<char> runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktrackPosition);
int lastIndexOf = runtextSpan.LastIndexOf(str);
if (lastIndexOf > -1)
{
// Found the next position to match. Move runtextpos here
runtextpos = _maxBacktrackPosition + lastIndexOf;
}
}

return false;
}

Expand Down Expand Up @@ -1185,6 +1200,7 @@ protected override void Go()
int len = Math.Min(Operand(1), Forwardchars());
char ch = (char)Operand(0);
int i;
int tempMaxBacktrackPosition = runtextpos;

if (!_rightToLeft && !_caseInsensitive)
{
Expand Down Expand Up @@ -1217,6 +1233,7 @@ protected override void Go()
if (len > i && _operator == RegexCode.Notoneloop)
{
TrackPush(len - i - 1, runtextpos - Bump());
_maxBacktrackPosition = tempMaxBacktrackPosition;
}
}
advance = 2;
Expand Down Expand Up @@ -1261,6 +1278,16 @@ protected override void Go()
{
int i = TrackPeek();
int pos = TrackPeek(1);
if (!_caseInsensitive && _maxBacktrackPosition != -1 && pos > _maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft)
{
// The Multi node has bumped us along already
int difference = pos - _maxBacktrackPosition;
Debug.Assert(difference > 0);
pos = runtextpos;
i -= difference;
// We shouldn't be backtracking anymore.
_maxBacktrackPosition = -1;
}
runtextpos = pos;
if (i > 0)
{
Expand Down
Loading

0 comments on commit 7eb749c

Please sign in to comment.