From a867e2385656d1f8f3b8421cc1464447b04528cf Mon Sep 17 00:00:00 2001 From: shishirchawla Date: Mon, 16 Sep 2019 23:15:12 -0700 Subject: [PATCH 1/7] Added new regex option RegexOptions.AnyNewLine, and regex nodes RegexNode.AnyEndZ and RegexNode.AnyEol --- .../ref/System.Text.RegularExpressions.cs | 1 + .../System/Text/RegularExpressions/Regex.cs | 5 +- .../Text/RegularExpressions/RegexCode.cs | 8 ++- .../Text/RegularExpressions/RegexFCD.cs | 15 +++++ .../RegularExpressions/RegexInterpreter.cs | 17 +++++ .../Text/RegularExpressions/RegexNode.cs | 8 ++- .../Text/RegularExpressions/RegexOptions.cs | 1 + .../Text/RegularExpressions/RegexParser.cs | 15 ++++- .../Text/RegularExpressions/RegexWriter.cs | 2 + .../tests/Regex.Ctor.Tests.cs | 4 +- .../tests/Regex.Match.Tests.cs | 64 +++++++++++++++++-- .../tests/Regex.MultipleMatches.Tests.cs | 4 +- 12 files changed, 128 insertions(+), 16 deletions(-) diff --git a/src/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index e0d82c559658..00e389379519 100644 --- a/src/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -236,6 +236,7 @@ public enum RegexOptions RightToLeft = 64, ECMAScript = 256, CultureInvariant = 512, + AnyNewLine = 1024, } public abstract partial class RegexRunner { diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index cbf9e27c01d7..75288e7d1a18 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -24,7 +24,7 @@ namespace System.Text.RegularExpressions /// public partial class Regex : ISerializable { - internal const int MaxOptionShift = 10; + internal const int 
MaxOptionShift = 11; protected internal string pattern; // The string pattern provided protected internal RegexOptions roptions; // the top-level options from the options string @@ -95,7 +95,8 @@ private Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, bool RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled | - RegexOptions.CultureInvariant + RegexOptions.CultureInvariant | + RegexOptions.AnyNewLine #if DEBUG | RegexOptions.Debug #endif diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs index b4bda3abf4a0..4112c125f494 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs @@ -53,7 +53,7 @@ internal sealed class RegexCode public const int Beginning = 18; // \A public const int Start = 19; // \G public const int EndZ = 20; // \Z - public const int End = 21; // \Z + public const int End = 21; // \z public const int Nothing = 22; // Reject! @@ -82,6 +82,9 @@ internal sealed class RegexCode public const int ECMABoundary = 41; // \b public const int NonECMABoundary = 42; // \B + public const int AnyEndZ = 43; // \Z + public const int AnyEol = 44; // $ + // Modifiers for alternate modes public const int Mask = 63; // Mask to get unmodified ordinary operator public const int Rtl = 64; // bit to indicate that we're reverse scanning. 
@@ -160,6 +163,7 @@ public static int OpcodeSize(int opcode) case Nothing: case Bol: case Eol: + case AnyEol: case Boundary: case Nonboundary: case ECMABoundary: @@ -167,6 +171,7 @@ public static int OpcodeSize(int opcode) case Beginning: case Start: case EndZ: + case AnyEndZ: case End: case Nullmark: case Setmark: @@ -229,6 +234,7 @@ public static int OpcodeSize(int opcode) #if ECMA "ECMABoundary", "NonECMABoundary", #endif + "AnyEndZ", "AnyEol", // FIXME }; private static string OperatorDescription(int Opcode) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs index e1baf0ec11cd..984c427ca2a8 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs @@ -33,6 +33,9 @@ internal ref struct RegexFCD public const int Boundary = 0x0040; public const int ECMABoundary = 0x0080; + public const int AnyEndZ = 0x0100; + public const int AnyEol = 0x0200; + private readonly List _fcStack; private ValueListBuilder _intStack; // must not be readonly private bool _skipAllChildren; // don't process any more children at the current level @@ -125,11 +128,13 @@ public static RegexPrefix Prefix(RegexTree tree) case RegexNode.Bol: case RegexNode.Eol: + case RegexNode.AnyEol: case RegexNode.Boundary: case RegexNode.ECMABoundary: case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: + case RegexNode.AnyEndZ: case RegexNode.End: case RegexNode.Empty: case RegexNode.Require: @@ -180,11 +185,13 @@ public static int Anchors(RegexTree tree) case RegexNode.Bol: case RegexNode.Eol: + case RegexNode.AnyEol: case RegexNode.Boundary: case RegexNode.ECMABoundary: case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: + case RegexNode.AnyEndZ: case RegexNode.End: return result | AnchorFromType(curNode.NType); @@ -212,11 
+219,13 @@ private static int AnchorFromType(int type) => { RegexNode.Bol => Bol, RegexNode.Eol => Eol, + RegexNode.AnyEol => AnyEol, RegexNode.Boundary => Boundary, RegexNode.ECMABoundary => ECMABoundary, RegexNode.Beginning => Beginning, RegexNode.Start => Start, RegexNode.EndZ => EndZ, + RegexNode.AnyEndZ => AnyEndZ, RegexNode.End => End, _ => 0, }; @@ -238,10 +247,14 @@ public static string AnchorDescription(int anchors) sb.Append(", ECMABoundary"); if (0 != (anchors & Eol)) sb.Append(", Eol"); + if (0 != (anchors & AnyEol)) + sb.Append(", AnyEol"); if (0 != (anchors & End)) sb.Append(", End"); if (0 != (anchors & EndZ)) sb.Append(", EndZ"); + if (0 != (anchors & AnyEndZ)) + sb.Append(", AnyEndZ"); if (sb.Length >= 2) return (sb.ToString(2, sb.Length - 2)); @@ -502,6 +515,7 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex) case RegexNode.Nothing: case RegexNode.Bol: case RegexNode.Eol: + case RegexNode.AnyEol: case RegexNode.Boundary: case RegexNode.Nonboundary: case RegexNode.ECMABoundary: @@ -509,6 +523,7 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex) case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: + case RegexNode.AnyEndZ: case RegexNode.End: PushFC(new RegexFC(true)); break; diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 08da6002de5f..6cdb61b4d445 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -824,6 +824,12 @@ protected override void Go() advance = 0; continue; + case RegexCode.AnyEol: + if (Rightchars() > 0 && CharAt(Textpos()) != '\n' && CharAt(Textpos()) != '\r') + break; + advance = 0; + continue; + case RegexCode.Boundary: if (!IsBoundary(Textpos(), runtextbeg, runtextend)) break; 
@@ -866,6 +872,17 @@ protected override void Go() advance = 0; continue; + case RegexCode.AnyEndZ: + int rightChars = Rightchars(); + if (rightChars > 2) + break; + if (rightChars == 1 && CharAt(Textpos()) != '\r' && CharAt(Textpos()) != '\n') + break; + if (rightChars == 2 && CharAt(Textpos()) != '\r' && CharAt(Textpos()+1) != '\n') + break; + advance = 0; + continue; + case RegexCode.End: if (Rightchars() > 0) break; diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 357a4d3ad13b..624ef89021c6 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -68,6 +68,7 @@ internal sealed class RegexNode public const int Bol = RegexCode.Bol; // ^ public const int Eol = RegexCode.Eol; // $ + public const int AnyEol = RegexCode.AnyEol; // $ public const int Boundary = RegexCode.Boundary; // \b public const int Nonboundary = RegexCode.Nonboundary; // \B public const int ECMABoundary = RegexCode.ECMABoundary; // \b @@ -75,6 +76,7 @@ internal sealed class RegexNode public const int Beginning = RegexCode.Beginning; // \A public const int Start = RegexCode.Start; // \G public const int EndZ = RegexCode.EndZ; // \Z + public const int AnyEndZ = RegexCode.AnyEndZ; // \Z public const int End = RegexCode.End; // \z // Interior nodes do not correspond to primitive operations, but @@ -564,9 +566,9 @@ public int Type() "Onelazy", "Notonelazy", "Setlazy", "One", "Notone", "Set", "Multi", "Ref", - "Bol", "Eol", "Boundary", "Nonboundary", + "Bol", "Eol", "AnyEol", "Boundary", "Nonboundary", "ECMABoundary", "NonECMABoundary", - "Beginning", "Start", "EndZ", "End", + "Beginning", "Start", "EndZ", "AnyEndZ", "End", "Nothing", "Empty", "Alternate", "Concatenate", "Loop", "Lazyloop", @@ -593,6 +595,8 @@ private string Description() 
ArgSb.Append("-X"); if ((Options & RegexOptions.ECMAScript) != 0) ArgSb.Append("-E"); + if ((Options & RegexOptions.AnyNewLine) != 0) + ArgSb.Append("-A"); switch (NType) { diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs index e3a3087bbec6..0dec646370f3 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs @@ -22,5 +22,6 @@ public enum RegexOptions ECMAScript = 0x0100, // "e" CultureInvariant = 0x0200, + AnyNewLine = 0x0400, // "a", Treat "$" as (?=[\r\n]|\z) } } diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 3b22b770b76a..cb72dd6f7fcb 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -360,7 +360,10 @@ private RegexNode ScanRegex() break; case '$': - AddUnitType(UseOptionM() ? RegexNode.Eol : RegexNode.EndZ); + if (UseOptionA()) + AddUnitType(UseOptionM() ? RegexNode.AnyEol : RegexNode.AnyEndZ); + else + AddUnitType(UseOptionM() ? RegexNode.Eol : RegexNode.EndZ); break; case '.': @@ -1627,6 +1630,7 @@ private static RegexOptions OptionFromCode(char ch) 'd' => RegexOptions.Debug, #endif 'e' => RegexOptions.ECMAScript, + 'a' => RegexOptions.AnyNewLine, _ => 0, }; } @@ -1939,6 +1943,15 @@ private bool UseOptionE() return (_options & RegexOptions.ECMAScript) != 0; } + /* + * True if the "a" option (RegexOptions.AnyNewLine) is on, which makes $ treat + * '\r', '\n' and "\r\n" alike as line endings (Windows and UNIX styles). 
+ */ + private bool UseOptionA() + { + return (_options & RegexOptions.AnyNewLine) != 0; + } + private const byte Q = 5; // quantifier private const byte S = 4; // ordinary stopper private const byte Z = 3; // ScanBlank stopper diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index eda301d91a58..43a950a2b256 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -480,6 +480,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex) case RegexNode.Nothing: case RegexNode.Bol: case RegexNode.Eol: + case RegexNode.AnyEol: case RegexNode.Boundary: case RegexNode.Nonboundary: case RegexNode.ECMABoundary: @@ -487,6 +488,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex) case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: + case RegexNode.AnyEndZ: case RegexNode.End: Emit(node.NType); break; diff --git a/src/System.Text.RegularExpressions/tests/Regex.Ctor.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Ctor.Tests.cs index ff7c0c4d9309..1e9bb5b6b39d 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Ctor.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Ctor.Tests.cs @@ -63,8 +63,8 @@ public static void Ctor_Invalid() AssertExtensions.Throws("options", () => new Regex("foo", (RegexOptions)(-1))); AssertExtensions.Throws("options", () => new Regex("foo", (RegexOptions)(-1), new TimeSpan())); - AssertExtensions.Throws("options", () => new Regex("foo", (RegexOptions)0x400)); - AssertExtensions.Throws("options", () => new Regex("foo", (RegexOptions)0x400, new TimeSpan())); + AssertExtensions.Throws("options", () => new Regex("foo", (RegexOptions)0x800)); + AssertExtensions.Throws("options", () => new Regex("foo", 
(RegexOptions)0x800, new TimeSpan())); AssertExtensions.Throws("options", () => new Regex("foo", RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant | RegexOptions.RightToLeft)); AssertExtensions.Throws("options", () => new Regex("foo", RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant | RegexOptions.ExplicitCapture)); diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 37e35782dd48..440f2c9ff45a 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -599,7 +599,37 @@ public static IEnumerable Match_Advanced_TestData() } }; - // Mutliline + // AnyEndZ (with '\n' used as line ending) + yield return new object[] + { + "line3\nline4$", "line1\nline2\nline3\nline4\n", RegexOptions.AnyNewLine, 0, 24, + new CaptureData[] + { + new CaptureData("line3\nline4", 12, 11) + } + }; + + // AnyEndZ (with '\r' used as line ending) + yield return new object[] + { + "line3\nline4$", "line1\nline2\nline3\nline4\r", RegexOptions.AnyNewLine, 0, 24, + new CaptureData[] + { + new CaptureData("line3\nline4", 12, 11) + } + }; + + // AnyEndZ (with '\r\n' used as line ending) + yield return new object[] + { + "line3\nline4$", "line1\nline2\nline3\nline4\r\n", RegexOptions.AnyNewLine, 0, 25, + new CaptureData[] + { + new CaptureData("line3\nline4", 12, 11) + } + }; + + // Multiline yield return new object[] { "(line2$\n)line3", "line1\nline2\nline3\n\nline4", RegexOptions.Multiline, 0, 24, @@ -610,7 +640,7 @@ public static IEnumerable Match_Advanced_TestData() } }; - // Mutliline + // Multiline yield return new object[] { "(line2\n^)line3", "line1\nline2\nline3\n\nline4", RegexOptions.Multiline, 0, 24, @@ -621,10 +651,10 @@ public static IEnumerable Match_Advanced_TestData() } }; - // Mutliline + // 
Multiline (with '\n' used as line ending) yield return new object[] { - "(line3\n$\n)line4", "line1\nline2\nline3\n\nline4", RegexOptions.Multiline, 0, 24, + "(line3\n$\n)line4", "line1\nline2\nline3\n\nline4", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 24, new CaptureData[] { new CaptureData("line3\n\nline4", 12, 12), @@ -632,7 +662,29 @@ public static IEnumerable Match_Advanced_TestData() } }; - // Mutliline + // Multiline (with '\r\n' used as line ending) + yield return new object[] + { + "(line3$\r\n)line4", "line1\nline2\nline3\r\nline4", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 24, + new CaptureData[] + { + new CaptureData("line3\r\nline4", 12, 12), + new CaptureData("line3\r\n", 12, 7) + } + }; + + // Multiline (with '\r' used as line ending) + yield return new object[] + { + "(line3$\r)line4", "line1\nline2\nline3\rline4", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 23, + new CaptureData[] + { + new CaptureData("line3\rline4", 12, 11), + new CaptureData("line3\r", 12, 6) + } + }; + + // Multiline yield return new object[] { "(line3\n^\n)line4", "line1\nline2\nline3\n\nline4", RegexOptions.Multiline, 0, 24, @@ -643,7 +695,7 @@ public static IEnumerable Match_Advanced_TestData() } }; - // Mutliline + // Multiline yield return new object[] { "(line2$\n^)line3", "line1\nline2\nline3\n\nline4", RegexOptions.Multiline, 0, 24, diff --git a/src/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs index 743404048b61..0fa776436b89 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs @@ -217,8 +217,8 @@ public void Matches_Invalid() // Options are invalid AssertExtensions.Throws("options", () => Regex.Matches("input", "pattern", (RegexOptions)(-1))); AssertExtensions.Throws("options", () => Regex.Matches("input", "pattern", 
(RegexOptions)(-1), TimeSpan.FromSeconds(1))); - AssertExtensions.Throws("options", () => Regex.Matches("input", "pattern", (RegexOptions)0x400)); - AssertExtensions.Throws("options", () => Regex.Matches("input", "pattern", (RegexOptions)0x400, TimeSpan.FromSeconds(1))); + AssertExtensions.Throws("options", () => Regex.Matches("input", "pattern", (RegexOptions)0x800)); + AssertExtensions.Throws("options", () => Regex.Matches("input", "pattern", (RegexOptions)0x800, TimeSpan.FromSeconds(1))); // MatchTimeout is invalid AssertExtensions.Throws("matchTimeout", () => Regex.Matches("input", "pattern", RegexOptions.None, TimeSpan.Zero)); From 64817bc25a27acf5d04d84ee0c5a72eca18de70b Mon Sep 17 00:00:00 2001 From: shishirchawla Date: Wed, 18 Sep 2019 16:10:15 -0700 Subject: [PATCH 2/7] Added IL code for new options. --- .../Text/RegularExpressions/RegexCompiler.cs | 72 +++++++++++++++++++ .../RegularExpressions/RegexInterpreter.cs | 4 +- .../tests/Regex.Ctor.Tests.cs | 2 +- .../tests/Regex.Match.Tests.cs | 24 +++++++ .../tests/Regex.MultipleMatches.Tests.cs | 29 ++++++++ 5 files changed, 128 insertions(+), 3 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index f81e09732089..4cdbcfac3bb7 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2372,6 +2372,27 @@ private void GenerateOneCode() break; } + case RegexCode.AnyEol: + //: if (Rightchars() > 0 && CharAt(Textpos()) != '\n' && CharAt(Textpos()) != '\r') + //: break Backward; + { + Label l1 = _labels[NextCodepos()]; + Label l2 = DefineLabel(); + Ldloc(_textposV); + Ldloc(_textendV); + Bge(l1); + Rightchar(); + Ldc((int)'\n'); + BneFar(l2); + Br(l1); // FIXME why do we need a branch here + + MarkLabel(l2); + Rightchar(); + 
Ldc((int)'\r'); + BneFar(_backtrack); + break; + } + case RegexCode.Boundary: case RegexCode.Nonboundary: //: if (!IsBoundary(Textpos(), _textbeg, _textend)) @@ -2434,6 +2455,57 @@ private void GenerateOneCode() BneFar(_backtrack); break; + case RegexCode.AnyEndZ: + //: if (rightChars > 2) + //: break Backward; + //: if (rightChars == 1 && CharAt(Textpos()) != '\r' && CharAt(Textpos()) != '\n') + //: break Backward; + //: if (rightChars == 2 && (CharAt(Textpos()) != '\r' || CharAt(Textpos()+1) != '\n')) + //: break Backward; + { + LocalBuilder diff = _tempV; + Label l1 = DefineLabel(); + Label l2 = DefineLabel(); + + Ldloc(_textposV); + Ldloc(_textendV); + Bge(_labels[NextCodepos()]); + + Ldloc(_textendV); + Ldloc(_textposV); + Sub(); + Stloc(diff); + Ldloc(diff); + Ldc(2); + BgtFar(_backtrack); + + Ldloc(diff); + Ldc(1); + BeqFar(l1); + Rightchar(); + Ldc((int)'\r'); + BneFar(_backtrack); + Ldloc(_textV); + Ldloc(_textposV); + Ldc(1); + Add(); + Callvirt(s_getcharM); + Ldc((int)'\n'); + BneFar(_backtrack); // FIXME why do we not need a branch here + + MarkLabel(l1); + Rightchar(); + Ldc((int)'\n'); + BneFar(l2); + Br(_labels[NextCodepos()]); + + MarkLabel(l2); + Rightchar(); + Ldc((int)'\r'); + BneFar(_backtrack); + break; + } + case RegexCode.End: //: if (Rightchars() > 0) //: break Backward; diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 6cdb61b4d445..9258efc0eddd 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -353,7 +353,7 @@ private char CharAt(int j) return runtext[j]; } - protected override bool FindFirstChar() + protected override bool FindFirstChar() // FIXME handle anyendz { if (0 != (_code.Anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ 
| RegexFCD.End))) { @@ -878,7 +878,7 @@ protected override void Go() break; if (rightChars == 1 && CharAt(Textpos()) != '\r' && CharAt(Textpos()) != '\n') break; - if (rightChars == 2 && CharAt(Textpos()) != '\r' && CharAt(Textpos()+1) != '\n') + if (rightChars == 2 && (CharAt(Textpos()) != '\r' || CharAt(Textpos()+1) != '\n')) break; advance = 0; continue; diff --git a/src/System.Text.RegularExpressions/tests/Regex.Ctor.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Ctor.Tests.cs index 1e9bb5b6b39d..c0ac0efe61ff 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Ctor.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Ctor.Tests.cs @@ -18,7 +18,7 @@ public static IEnumerable Ctor_TestData() yield return new object[] { "foo", RegexOptions.None, Timeout.InfiniteTimeSpan }; yield return new object[] { "foo", RegexOptions.RightToLeft, Timeout.InfiniteTimeSpan }; yield return new object[] { "foo", RegexOptions.Compiled, Timeout.InfiniteTimeSpan }; - yield return new object[] { "foo", RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant, Timeout.InfiniteTimeSpan }; + yield return new object[] { "foo", RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.AnyNewLine | RegexOptions.Multiline | RegexOptions.CultureInvariant, Timeout.InfiniteTimeSpan }; yield return new object[] { "foo", RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant | RegexOptions.Compiled, Timeout.InfiniteTimeSpan }; yield return new object[] { "foo", RegexOptions.None, new TimeSpan(1) }; yield return new object[] { "foo", RegexOptions.None, TimeSpan.FromMilliseconds(int.MaxValue - 1) }; diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 440f2c9ff45a..8bb7ad50aeb7 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ 
b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -298,6 +298,30 @@ public static IEnumerable Match_Basic_TestData() // Surrogate pairs splitted up into UTF-16 code units. yield return new object[] { @"(\uD82F[\uDCA0-\uDCA3])", "\uD82F\uDCA2", RegexOptions.CultureInvariant, 0, 2, true, "\uD82F\uDCA2" }; + + // AnyNewLine (with none of the special characters used as line ending) + yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4", RegexOptions.AnyNewLine, 0, 23, true, "line3\nline4" }; + + // AnyNewLine (with '\n' used as line ending) + yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4\n", RegexOptions.AnyNewLine, 0, 24, true, "line3\nline4" }; + + // AnyNewLine (with '\r' used as line ending) + yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4\r", RegexOptions.AnyNewLine, 0, 24, true, "line3\nline4" }; + + // AnyNewLine (with '\r\n' used as line ending) + yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4\r\n", RegexOptions.AnyNewLine, 0, 25, true, "line3\nline4" }; + + // AnyNewLine | Multiline (with none of the special characters used as line ending) + yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 23, true, "line3\nline4" }; + + // AnyNewLine | Multiline (with '\n' used as line ending) + yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4\n", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 24, true, "line3\nline4" }; + + // AnyNewLine | Multiline (with '\r' used as line ending) + yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4\r", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 24, true, "line3\nline4" }; + + // AnyNewLine | Multiline (with '\r\n' used as line ending) + yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4\r\n", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 25, true, 
"line3\nline4" }; } [Theory] diff --git a/src/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs index 0fa776436b89..3ec504534835 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.MultipleMatches.Tests.cs @@ -140,6 +140,35 @@ public static IEnumerable Matches_TestData() new CaptureData("C789", 10, 4), } }; + + yield return new object[] + { + "^line3$\nline4", "line1\nline2\nline3\nline4\nline3\nline4\n", RegexOptions.Multiline, + new CaptureData[] + { + new CaptureData("line3\nline4", 12, 11), + new CaptureData("line3\nline4", 24, 11), + } + }; + + yield return new object[] + { + "^line3$", "line1\nline2\nline3\r\nline4\nline3\nline4\n", RegexOptions.Multiline | RegexOptions.AnyNewLine, + new CaptureData[] + { + new CaptureData("line3", 12, 5), + new CaptureData("line3", 25, 5), + } + }; + + yield return new object[] + { + "line3$", "line1\nline2\nline3\r\nline4\nline3\r", RegexOptions.AnyNewLine, + new CaptureData[] + { + new CaptureData("line3", 25, 5), + } + }; } [Theory] From a73a3a90a8945fcff049bfb73f127783fc61dd04 Mon Sep 17 00:00:00 2001 From: shishirchawla Date: Wed, 18 Sep 2019 21:32:05 -0700 Subject: [PATCH 3/7] Added required changes to FindFirstChar() function in Interpreter and Compiler, and minor other fixes. 
--- .../Text/RegularExpressions/RegexCompiler.cs | 80 +++++++++++++++---- .../RegularExpressions/RegexInterpreter.cs | 12 ++- 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 4cdbcfac3bb7..d0d71ad7db54 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1080,7 +1080,7 @@ protected void GenerateFindFirstChar() InitLocalCultureInfo(); - if (0 != (_anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End))) + if (0 != (_anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.AnyEndZ | RegexFCD.End))) { if (!_code.RightToLeft) { @@ -1112,7 +1112,7 @@ protected void GenerateFindFirstChar() MarkLabel(l1); } - if (0 != (_anchors & RegexFCD.EndZ)) + if (0 != (_anchors & (RegexFCD.EndZ | RegexFCD.AnyEndZ))) { Label l1 = DefineLabel(); Ldthisfld(s_textposF); @@ -1182,6 +1182,57 @@ protected void GenerateFindFirstChar() MarkLabel(l2); } + if (0 != (_anchors & RegexFCD.AnyEndZ)) + { + LocalBuilder diff = _tempV; + Label l1 = DefineLabel(); + Label l2 = DefineLabel(); + Label l3 = DefineLabel(); + Ldthisfld(s_textendF); + Ldthisfld(s_textposF); + Sub(); + Stloc(diff); + Ldloc(diff); + Ldc(2); + Bgt(l1); + Ldloc(diff); + Ldc(2); + Blt(l2); + Ldthisfld(s_textF); + Ldthisfld(s_textposF); + Callvirt(s_getcharM); + Ldc((int)'\r'); + Bne(l1); + Ldthisfld(s_textF); + Ldthisfld(s_textposF); + Ldc(1); + Add(); + Callvirt(s_getcharM); + Ldc((int)'\n'); + Bne(l1); + Br(l3); + + MarkLabel(l2); + Ldthisfld(s_textF); + Ldthisfld(s_textposF); + Callvirt(s_getcharM); + Ldc((int)'\n'); + Beq(l3); + Ldthisfld(s_textF); + Ldthisfld(s_textposF); + Callvirt(s_getcharM); + Ldc((int)'\r'); + Beq(l3); + + MarkLabel(l1); + 
Ldthis(); + Ldthisfld(s_textbegF); + Stfld(s_textposF); + Ldc(0); + Ret(); + MarkLabel(l3); + } + if (0 != (_anchors & RegexFCD.Start)) { Label l1 = DefineLabel(); @@ -2383,8 +2434,8 @@ private void GenerateOneCode() Bge(l1); Rightchar(); Ldc((int)'\n'); - BneFar(l2); - Br(l1); // FIXME why do we need a branch here + Bne(l2); + Br(l1); MarkLabel(l2); Rightchar(); @@ -2465,11 +2516,6 @@ private void GenerateOneCode() { LocalBuilder diff = _tempV; Label l1 = DefineLabel(); - Label l2 = DefineLabel(); - - Ldloc(_textposV); - Ldloc(_textendV); - Bge(_labels[NextCodepos()]); Ldloc(_textendV); Ldloc(_textposV); @@ -2478,10 +2524,9 @@ private void GenerateOneCode() Ldloc(diff); Ldc(2); BgtFar(_backtrack); - Ldloc(diff); - Ldc(1); - BeqFar(l1); + Ldc(2); + Blt(l1); Rightchar(); Ldc((int)'\r'); BneFar(_backtrack); @@ -2491,15 +2536,16 @@ private void GenerateOneCode() Add(); Callvirt(s_getcharM); Ldc((int)'\n'); - BneFar(_backtrack); // FIXME why do we not need a branch here + BneFar(_backtrack); + Br(_labels[NextCodepos()]); MarkLabel(l1); + Ldloc(diff); + Ldc(1); + Blt(_labels[NextCodepos()]); Rightchar(); Ldc((int)'\n'); - BneFar(l2); - Br(_labels[NextCodepos()]); - - MarkLabel(l2); + Beq(_labels[NextCodepos()]); Rightchar(); Ldc((int)'\r'); BneFar(_backtrack); diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 9258efc0eddd..6140a96903f0 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -353,9 +353,10 @@ private char CharAt(int j) return runtext[j]; } - protected override bool FindFirstChar() // FIXME handle anyendz + protected override bool FindFirstChar() { - if (0 != (_code.Anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End))) + if (0 != 
(_code.Anchors & (RegexFCD.Beginning | RegexFCD.Start | + RegexFCD.EndZ | RegexFCD.AnyEndZ | RegexFCD.End))) { if (!_code.RightToLeft) { @@ -365,7 +366,7 @@ protected override bool FindFirstChar() // FIXME handle anyendz runtextpos = runtextend; return false; } - if (0 != (_code.Anchors & RegexFCD.EndZ) && runtextpos < runtextend - 1) + if (0 != (_code.Anchors & (RegexFCD.EndZ | RegexFCD.AnyEndZ)) && runtextpos < runtextend - 1) { runtextpos = runtextend - 1; } @@ -379,6 +380,11 @@ protected override bool FindFirstChar() // FIXME handle anyendz if ((0 != (_code.Anchors & RegexFCD.End) && runtextpos < runtextend) || (0 != (_code.Anchors & RegexFCD.EndZ) && (runtextpos < runtextend - 1 || (runtextpos == runtextend - 1 && CharAt(runtextpos) != '\n'))) || + (0 != (_code.Anchors & RegexFCD.AnyEndZ) && (runtextpos < runtextend - 2 || + (runtextpos == runtextend - 2 && (CharAt(runtextpos) != '\r' + || CharAt(runtextpos+1) != '\n')) || + (runtextpos == runtextend - 1 && CharAt(runtextpos) != '\n' + && CharAt(runtextpos) != '\r'))) || (0 != (_code.Anchors & RegexFCD.Start) && runtextpos < runtextstart)) { runtextpos = runtextbeg; From d821593bf1345b9d268badb466f59d281fe5191a Mon Sep 17 00:00:00 2001 From: shishirchawla Date: Wed, 18 Sep 2019 23:20:14 -0700 Subject: [PATCH 4/7] Removed ECMA debug variable check --- .../src/System/Text/RegularExpressions/RegexCode.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs index 4112c125f494..1c05e4000eb5 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs @@ -231,10 +231,8 @@ public static int OpcodeSize(int opcode) "Nullmark", "Setmark", "Capturemark", "Getmark", "Setjump", "Backjump", "Forejump", "Testref", "Goto", 
"Prune", "Stop", -#if ECMA "ECMABoundary", "NonECMABoundary", -#endif - "AnyEndZ", "AnyEol", // FIXME + "AnyEndZ", "AnyEol", }; private static string OperatorDescription(int Opcode) From 4906d651e3dac604450c0bfd2341a893e166fbfe Mon Sep 17 00:00:00 2001 From: shishirchawla Date: Wed, 18 Sep 2019 23:31:35 -0700 Subject: [PATCH 5/7] minor --- .../src/System/Text/RegularExpressions/RegexCompiler.cs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index d0d71ad7db54..1c3c3853f0b0 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2428,16 +2428,12 @@ private void GenerateOneCode() //: break Backward; { Label l1 = _labels[NextCodepos()]; - Label l2 = DefineLabel(); Ldloc(_textposV); Ldloc(_textendV); Bge(l1); Rightchar(); Ldc((int)'\n'); - Bne(l2); - Br(l1); - - MarkLabel(l2); + Beq(l1); Rightchar(); Ldc((int)'\r'); BneFar(_backtrack); From 092c7c257949a09c211edeeaeed8216ca135aa7e Mon Sep 17 00:00:00 2001 From: shishirchawla Date: Wed, 18 Sep 2019 23:59:52 -0700 Subject: [PATCH 6/7] Added tests to test FindFirstChar. 
--- .../src/System/Text/RegularExpressions/RegexCompiler.cs | 3 +++ .../tests/Regex.Match.Tests.cs | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 1c3c3853f0b0..47b3edece178 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1213,6 +1213,9 @@ protected void GenerateFindFirstChar() Br(l3); MarkLabel(l2); + Ldloc(diff); + Ldc(1); + Blt(l3); Ldthisfld(s_textF); Ldthisfld(s_textposF); Callvirt(s_getcharM); diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 8bb7ad50aeb7..6ac6e59a1a99 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -322,6 +322,12 @@ public static IEnumerable Match_Basic_TestData() // AnyNewLine | Multiline (with '\r\n' used as line ending) yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4\r\n", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 25, true, "line3\nline4" }; + + // AnyNewLine (tests FindFirstChar()) + yield return new object[] { @"$", "line1\nline2\nline3\nline4\r\n", RegexOptions.AnyNewLine, 0, 25, true, "" }; + + // AnyNewLine | RightToLeft (tests FindFirstChar()) + yield return new object[] { @"$", "line1\nline2\nline3\nline4\r\n", RegexOptions.RightToLeft | RegexOptions.AnyNewLine, 0, 25, true, "" }; } [Theory] From 93261f8b3b5ce59cce48adca520688ee754ffcff Mon Sep 17 00:00:00 2001 From: shishirchawla Date: Sun, 10 Nov 2019 21:58:38 -0800 Subject: [PATCH 7/7] Added special handling for '.' for anynewline and related test case. 
--- .../Text/RegularExpressions/RegexInterpreter.cs | 2 +- .../System/Text/RegularExpressions/RegexOptions.cs | 2 +- .../System/Text/RegularExpressions/RegexParser.cs | 14 +++++++++++++- .../tests/Regex.Match.Tests.cs | 7 +++++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 6140a96903f0..93eb1d1fc477 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -884,7 +884,7 @@ protected override void Go() break; if (rightChars == 1 && CharAt(Textpos()) != '\r' && CharAt(Textpos()) != '\n') break; - if (rightChars == 2 && (CharAt(Textpos()) != '\r' || CharAt(Textpos()+1) != '\n')) + if (rightChars == 2 && (CharAt(Textpos()) != '\r' || CharAt(Textpos() + 1) != '\n')) break; advance = 0; continue; diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs index 0dec646370f3..3e22bf85e4e7 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs @@ -22,6 +22,6 @@ public enum RegexOptions ECMAScript = 0x0100, // "e" CultureInvariant = 0x0200, - AnyNewLine = 0x0400, // "a", Treat "$" as (?=[\r\n]|\z) + AnyNewLine = 0x0400, // "a", Treat "$" as (?=\r\z|\n\z|\r\n\z|\z) } } diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index cb72dd6f7fcb..1817033de33b 100644 --- 
a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -370,7 +370,19 @@ private RegexNode ScanRegex() if (UseOptionS()) AddUnitSet(RegexCharClass.AnyClass); else - AddUnitNotone('\n'); + { + if (UseOptionA()) + { + // Allow everything from RegexCharClass.AnyClass except '\r' and '\n' + RegexCharClass anyClass = RegexCharClass.Parse(RegexCharClass.AnyClass); + RegexCharClass lecc = new RegexCharClass(); // line ending character class + lecc.AddChar('\r'); + lecc.AddChar('\n'); + AddUnitSet(anyClass.AddSubtraction(lecc).ToStringClass()); + } + else + AddUnitNotone('\n'); + } break; case '{': diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 6ac6e59a1a99..231def478656 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -323,11 +323,14 @@ public static IEnumerable Match_Basic_TestData() // AnyNewLine | Multiline (with '\r\n' used as line ending) yield return new object[] { @"line3\nline4$", "line1\nline2\nline3\nline4\r\n", RegexOptions.Multiline | RegexOptions.AnyNewLine, 0, 25, true, "line3\nline4" }; - // AnyNewLine (tests FindFirstChar()) + // AnyNewLine yield return new object[] { @"$", "line1\nline2\nline3\nline4\r\n", RegexOptions.AnyNewLine, 0, 25, true, "" }; - // AnyNewLine | RightToLeft (tests FindFirstChar()) + // AnyNewLine | RightToLeft yield return new object[] { @"$", "line1\nline2\nline3\nline4\r\n", RegexOptions.RightToLeft | RegexOptions.AnyNewLine, 0, 25, true, "" }; + + // AnyNewLine | Multiline ('.' will match everything except \r and \n) + yield return new object[] { @".*$", "foo\r\nbar", RegexOptions.AnyNewLine | RegexOptions.Multiline, 0, 8, true, "foo" }; } [Theory]