Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.
/ corefx Public archive

Adds a new regex option - RegexOptions.AnyNewLine. #41195

Closed
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ public enum RegexOptions
RightToLeft = 64,
ECMAScript = 256,
CultureInvariant = 512,
AnyNewLine = 1024,
}
public abstract partial class RegexRunner
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace System.Text.RegularExpressions
/// </summary>
public partial class Regex : ISerializable
{
internal const int MaxOptionShift = 10;
internal const int MaxOptionShift = 11;

protected internal string pattern; // The string pattern provided
protected internal RegexOptions roptions; // the top-level options from the options string
Expand Down Expand Up @@ -95,7 +95,8 @@ private Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, bool
RegexOptions.IgnoreCase |
RegexOptions.Multiline |
RegexOptions.Compiled |
RegexOptions.CultureInvariant
RegexOptions.CultureInvariant |
RegexOptions.AnyNewLine
#if DEBUG
| RegexOptions.Debug
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ internal sealed class RegexCode
public const int Beginning = 18; // \A
public const int Start = 19; // \G
public const int EndZ = 20; // \Z
public const int End = 21; // \Z
public const int End = 21; // \z

public const int Nothing = 22; // Reject!

Expand Down Expand Up @@ -82,6 +82,9 @@ internal sealed class RegexCode
public const int ECMABoundary = 41; // \b
public const int NonECMABoundary = 42; // \B

public const int AnyEndZ = 43; // \Z
public const int AnyEol = 44; // $

// Modifiers for alternate modes
public const int Mask = 63; // Mask to get unmodified ordinary operator
public const int Rtl = 64; // bit to indicate that we're reverse scanning.
Expand Down Expand Up @@ -160,13 +163,15 @@ public static int OpcodeSize(int opcode)
case Nothing:
case Bol:
case Eol:
case AnyEol:
case Boundary:
case Nonboundary:
case ECMABoundary:
case NonECMABoundary:
case Beginning:
case Start:
case EndZ:
case AnyEndZ:
case End:
case Nullmark:
case Setmark:
Expand Down Expand Up @@ -226,9 +231,8 @@ public static int OpcodeSize(int opcode)
"Nullmark", "Setmark", "Capturemark", "Getmark",
"Setjump", "Backjump", "Forejump", "Testref", "Goto",
"Prune", "Stop",
#if ECMA
"ECMABoundary", "NonECMABoundary",
#endif
"AnyEndZ", "AnyEol",
};

private static string OperatorDescription(int Opcode)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1080,7 +1080,7 @@ protected void GenerateFindFirstChar()

InitLocalCultureInfo();

if (0 != (_anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End)))
if (0 != (_anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.AnyEndZ | RegexFCD.End)))
{
if (!_code.RightToLeft)
{
Expand Down Expand Up @@ -1112,7 +1112,7 @@ protected void GenerateFindFirstChar()
MarkLabel(l1);
}

if (0 != (_anchors & RegexFCD.EndZ))
if (0 != (_anchors & (RegexFCD.EndZ | RegexFCD.AnyEndZ)))
{
Label l1 = DefineLabel();
Ldthisfld(s_textposF);
Expand Down Expand Up @@ -1182,6 +1182,60 @@ protected void GenerateFindFirstChar()
MarkLabel(l2);
}

if (0 != (_anchors & RegexFCD.AnyEndZ))
{
LocalBuilder diff = _tempV;
Label l1 = DefineLabel();
Label l2 = DefineLabel();
Label l3 = DefineLabel();
Ldthisfld(s_textendF);
Ldthisfld(s_textposF);
Sub();
Stloc(diff);
Ldloc(diff);
Ldc(2);
Bgt(l1);
Ldloc(diff);
Ldc(2);
Blt(l2);
Ldthisfld(s_textF);
Ldthisfld(s_textposF);
Callvirt(s_getcharM);
Ldc((int)'\r');
Bne(l1);
Ldthisfld(s_textF);
Ldthisfld(s_textposF);
Ldc(1);
Add();
Callvirt(s_getcharM);
Ldc((int)'\n');
Bne(l1);
Br(l3);

MarkLabel(l2);
Ldloc(diff);
Ldc(1);
Blt(l3);
Ldthisfld(s_textF);
Ldthisfld(s_textposF);
Callvirt(s_getcharM);
Ldc((int)'\n');
Beq(l3);
Ldthisfld(s_textF);
Ldthisfld(s_textposF);
Callvirt(s_getcharM);
Ldc((int)'\r');
Beq(l3);

MarkLabel(l1);
Ldthis();
Ldthisfld(s_textbegF);
Stfld(s_textposF);
Ldc(0);
Ret();
MarkLabel(l3);
}

if (0 != (_anchors & RegexFCD.Start))
{
Label l1 = DefineLabel();
Expand Down Expand Up @@ -2372,6 +2426,23 @@ private void GenerateOneCode()
break;
}

case RegexCode.AnyEol:
//: if (Rightchars() > 0 && CharAt(Textpos()) != '\n' && CharAt(Textpos()) != '\r')
//: break Backward;
{
Label l1 = _labels[NextCodepos()];
Ldloc(_textposV);
Ldloc(_textendV);
Bge(l1);
Rightchar();
Ldc((int)'\n');
Beq(l1);
Rightchar();
Ldc((int)'\r');
BneFar(_backtrack);
break;
}

case RegexCode.Boundary:
case RegexCode.Nonboundary:
//: if (!IsBoundary(Textpos(), _textbeg, _textend))
Expand Down Expand Up @@ -2434,6 +2505,52 @@ private void GenerateOneCode()
BneFar(_backtrack);
break;

case RegexCode.AnyEndZ:
//: if (rightChars > 2)
//: break Backward;
//: if (rightChars == 1 && CharAt(Textpos()) != '\r' && CharAt(Textpos()) != '\n')
//: break Backward;
//: if (rightChars == 2 && (CharAt(Textpos()) != '\r' || CharAt(Textpos()+1) != '\n'))
//: break Backward;
{
LocalBuilder diff = _tempV;
Label l1 = DefineLabel();

Ldloc(_textendV);
Ldloc(_textposV);
Sub();
Stloc(diff);
Ldloc(diff);
Ldc(2);
BgtFar(_backtrack);
Ldloc(diff);
Ldc(2);
Blt(l1);
Rightchar();
Ldc((int)'\r');
BneFar(_backtrack);
Ldloc(_textV);
Ldloc(_textposV);
Ldc(1);
Add();
Callvirt(s_getcharM);
Ldc((int)'\n');
BneFar(_backtrack);
Br(_labels[NextCodepos()]);

MarkLabel(l1);
Ldloc(diff);
Ldc(1);
Blt(_labels[NextCodepos()]);
Rightchar();
Ldc((int)'\n');
Beq(_labels[NextCodepos()]);
Rightchar();
Ldc((int)'\r');
BneFar(_backtrack);
break;
}

case RegexCode.End:
//: if (Rightchars() > 0)
//: break Backward;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ internal ref struct RegexFCD
public const int Boundary = 0x0040;
public const int ECMABoundary = 0x0080;

public const int AnyEndZ = 0x0100;
public const int AnyEol = 0x0200;

private readonly List<RegexFC> _fcStack;
private ValueListBuilder<int> _intStack; // must not be readonly
private bool _skipAllChildren; // don't process any more children at the current level
Expand Down Expand Up @@ -125,11 +128,13 @@ public static RegexPrefix Prefix(RegexTree tree)

case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.AnyEol:
case RegexNode.Boundary:
case RegexNode.ECMABoundary:
case RegexNode.Beginning:
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.AnyEndZ:
case RegexNode.End:
case RegexNode.Empty:
case RegexNode.Require:
Expand Down Expand Up @@ -180,11 +185,13 @@ public static int Anchors(RegexTree tree)

case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.AnyEol:
case RegexNode.Boundary:
case RegexNode.ECMABoundary:
case RegexNode.Beginning:
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.AnyEndZ:
case RegexNode.End:
return result | AnchorFromType(curNode.NType);

Expand Down Expand Up @@ -212,11 +219,13 @@ private static int AnchorFromType(int type) =>
{
RegexNode.Bol => Bol,
RegexNode.Eol => Eol,
RegexNode.AnyEol => AnyEol,
RegexNode.Boundary => Boundary,
RegexNode.ECMABoundary => ECMABoundary,
RegexNode.Beginning => Beginning,
RegexNode.Start => Start,
RegexNode.EndZ => EndZ,
RegexNode.AnyEndZ => AnyEndZ,
RegexNode.End => End,
_ => 0,
};
Expand All @@ -238,10 +247,14 @@ public static string AnchorDescription(int anchors)
sb.Append(", ECMABoundary");
if (0 != (anchors & Eol))
sb.Append(", Eol");
if (0 != (anchors & AnyEol))
sb.Append(", AnyEol");
if (0 != (anchors & End))
sb.Append(", End");
if (0 != (anchors & EndZ))
sb.Append(", EndZ");
if (0 != (anchors & AnyEndZ))
sb.Append(", AnyEndZ");

if (sb.Length >= 2)
return (sb.ToString(2, sb.Length - 2));
Expand Down Expand Up @@ -502,13 +515,15 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
case RegexNode.Nothing:
case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.AnyEol:
case RegexNode.Boundary:
case RegexNode.Nonboundary:
case RegexNode.ECMABoundary:
case RegexNode.NonECMABoundary:
case RegexNode.Beginning:
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.AnyEndZ:
case RegexNode.End:
PushFC(new RegexFC(true));
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,8 @@ private char CharAt(int j)

protected override bool FindFirstChar()
{
if (0 != (_code.Anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End)))
if (0 != (_code.Anchors & (RegexFCD.Beginning | RegexFCD.Start |
RegexFCD.EndZ | RegexFCD.AnyEndZ | RegexFCD.End)))
{
if (!_code.RightToLeft)
{
Expand All @@ -365,7 +366,7 @@ protected override bool FindFirstChar()
runtextpos = runtextend;
return false;
}
if (0 != (_code.Anchors & RegexFCD.EndZ) && runtextpos < runtextend - 1)
if (0 != (_code.Anchors & (RegexFCD.EndZ | RegexFCD.AnyEndZ)) && runtextpos < runtextend - 1)
{
runtextpos = runtextend - 1;
}
Expand All @@ -379,6 +380,11 @@ protected override bool FindFirstChar()
if ((0 != (_code.Anchors & RegexFCD.End) && runtextpos < runtextend) ||
(0 != (_code.Anchors & RegexFCD.EndZ) && (runtextpos < runtextend - 1 ||
(runtextpos == runtextend - 1 && CharAt(runtextpos) != '\n'))) ||
(0 != (_code.Anchors & RegexFCD.AnyEndZ) && (runtextpos < runtextend - 2 ||
(runtextpos == runtextend - 2 && (CharAt(runtextpos) != '\r'
|| CharAt(runtextpos+1) != '\n')) ||
(runtextpos == runtextend - 1 && CharAt(runtextpos) != '\n'
&& CharAt(runtextpos) != '\r'))) ||
(0 != (_code.Anchors & RegexFCD.Start) && runtextpos < runtextstart))
{
runtextpos = runtextbeg;
Expand Down Expand Up @@ -824,6 +830,12 @@ protected override void Go()
advance = 0;
continue;

case RegexCode.AnyEol:
if (Rightchars() > 0 && CharAt(Textpos()) != '\n' && CharAt(Textpos()) != '\r')
break;
advance = 0;
continue;

case RegexCode.Boundary:
if (!IsBoundary(Textpos(), runtextbeg, runtextend))
break;
Expand Down Expand Up @@ -866,6 +878,17 @@ protected override void Go()
advance = 0;
continue;

case RegexCode.AnyEndZ:
int rightChars = Rightchars();
if (rightChars > 2)
break;
if (rightChars == 1 && CharAt(Textpos()) != '\r' && CharAt(Textpos()) != '\n')
break;
if (rightChars == 2 && (CharAt(Textpos()) != '\r' || CharAt(Textpos()+1) != '\n'))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: add spaces around the `+` operator, i.e. write `CharAt(Textpos() + 1)` instead of `CharAt(Textpos()+1)`.

break;
advance = 0;
continue;

case RegexCode.End:
if (Rightchars() > 0)
break;
Expand Down
Loading