Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A little more regex source generator tweaking #62605

Merged
merged 7 commits into from
Dec 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ public partial class RegexGenerator
"#nullable enable",
"#pragma warning disable CS0162 // Unreachable code",
"#pragma warning disable CS0164 // Unreferenced label",
"#pragma warning disable CS0168 // Variable declared but never used",
"#pragma warning disable CS0219 // Variable assigned but never used",
"",
};
Expand Down Expand Up @@ -274,13 +273,11 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
bool hasTextInfo = false;

// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
// To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
// delete it if no additional declarations are required, or we replace it with the list of additional declarations
// built up while generating code.
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
// and then insert them at that position once everything else has been output.
var additionalDeclarations = new HashSet<string>();

// Emit locals initialization
writer.WriteLine("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
joperezr marked this conversation as resolved.
Show resolved Hide resolved
writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;");
writer.Flush();
int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length;
Expand Down Expand Up @@ -315,15 +312,17 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
{
case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix));
EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix);
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix);
break;

case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive:
case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive:
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
EmitFixedSet_LeftToRight();
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitFixedSet();
break;

default:
Expand All @@ -338,7 +337,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
}
writer.WriteLine();

writer.WriteLine("// No match");
writer.WriteLine("// No starting position found");
writer.WriteLine("ReturnFalse:");
writer.WriteLine("base.runtextpos = end;");
writer.WriteLine("return false;");
Expand Down Expand Up @@ -368,8 +367,7 @@ bool EmitAnchors()

case RegexPrefixAnalyzer.Start:
writer.WriteLine("// Start \\G anchor");
additionalDeclarations.Add("int start = base.runtextstart;");
using (EmitBlock(writer, "if (pos > start)"))
using (EmitBlock(writer, "if (pos > base.runtextstart)"))
{
writer.WriteLine("goto ReturnFalse;");
}
Expand Down Expand Up @@ -400,6 +398,7 @@ bool EmitAnchors()
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
// to boost our position to the next line, and then continue normally with any searches.
writer.WriteLine("// Beginning-of-line anchor");
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
additionalDeclarations.Add("int beginning = base.runtextbeg;");
using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')"))
{
Expand All @@ -418,8 +417,8 @@ bool EmitAnchors()
return false;
}

// Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern.
void EmitIndexOf_LeftToRight(string prefix)
// Emits a case-sensitive prefix search for a string at the beginning of the pattern.
void EmitIndexOf(string prefix)
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
{
writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(inputSpan.Slice(pos, end - pos), {Literal(prefix)});");
writer.WriteLine("if (i >= 0)");
Expand All @@ -429,9 +428,9 @@ void EmitIndexOf_LeftToRight(string prefix)
writer.WriteLine("}");
}

// Emits a left-to-right search for a set at a fixed position from the start of the pattern,
// Emits a search for a set at a fixed position from the start of the pattern,
// and potentially other sets at other fixed positions in the pattern.
void EmitFixedSet_LeftToRight()
void EmitFixedSet()
{
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = code.FindOptimizations.FixedDistanceSets;
(char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0];
Expand Down Expand Up @@ -600,15 +599,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)

RegexOptions options = (RegexOptions)rm.Options;
RegexCode code = rm.Code;
bool hasTimeout = false;

// Helper to define names. Names start unadorned, but as soon as there's repetition,
// they begin to have a numbered suffix.
var usedNames = new Dictionary<string, int>();

// Every RegexTree is rooted in the implicit Capture for the whole expression.
// Skip the Capture node. We handle the implicit root capture specially.
RegexNode node = rm.Code.Tree.Root;
RegexNode node = code.Tree.Root;
Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node");
Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child");
node = node.Child(0);
Expand All @@ -635,9 +633,8 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
}

// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
// To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
// delete it if no additional declarations are required, or we replace it with the list of additional declarations
// built up while generating code.
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
// and then insert them at that position once everything else has been output.
var additionalDeclarations = new HashSet<string>();
var additionalLocalFunctions = new Dictionary<string, string[]>();

Expand All @@ -646,14 +643,11 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
writer.WriteLine("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;");
writer.WriteLine($"int original_pos = pos;");
hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm);
bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm);
bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm);
writer.Flush();
int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length;
int additionalDeclarationsIndent = writer.Indent;
writer.WriteLine();

// TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo; // only if the whole expression or any subportion is ignoring case, and we're not using invariant
bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm);

// The implementation tries to use const indexes into the span wherever possible, which we can do
// for all fixed-length constructs. In such cases (e.g. single chars, repeaters, strings, etc.)
Expand Down Expand Up @@ -936,8 +930,7 @@ void EmitAllBranches()

// Save off pos. We'll need to reset this each time a branch fails.
string startingPos = ReserveName("alternation_starting_pos");
additionalDeclarations.Add($"int {startingPos} = 0;");
writer.WriteLine($"{startingPos} = pos;");
writer.WriteLine($"int {startingPos} = pos;");
int startingSliceStaticPos = sliceStaticPos;

// We need to be able to undo captures in two situations:
Expand All @@ -964,8 +957,7 @@ void EmitAllBranches()
if (expressionHasCaptures && ((node.Options & RegexNode.HasCapturesFlag) != 0 || !isAtomic))
{
startingCapturePos = ReserveName("alternation_starting_capturepos");
additionalDeclarations.Add($"int {startingCapturePos} = 0;");
writer.WriteLine($"{startingCapturePos} = base.Crawlpos();");
writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();");
}
writer.WriteLine();

Expand Down Expand Up @@ -1211,7 +1203,7 @@ void EmitBackreferenceConditional(RegexNode node)
// to backtrack to. So, we expose a single Backtrack label and track which branch was
// followed in this resumeAt local.
string resumeAt = ReserveName("conditionalbackreference_branch");
additionalDeclarations.Add($"int {resumeAt} = 0;");
writer.WriteLine($"int {resumeAt} = 0;");

// While it would be nicely readable to use an if/else block, if the branches contain
// anything that triggers backtracking, labels will end up being defined, and if they're
Expand Down Expand Up @@ -1340,7 +1332,12 @@ void EmitExpressionConditional(RegexNode node)
{
startingCapturePos = ReserveName("conditionalexpression_starting_capturepos");
writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();");
writer.WriteLine();
}

string resumeAt = ReserveName("conditionalexpression_resumeAt");
if (!isAtomic)
{
writer.WriteLine($"int {resumeAt} = 0;");
}

// Emit the conditional expression. We need to reroute any match failures to either the "no" branch
Expand All @@ -1353,13 +1350,7 @@ void EmitExpressionConditional(RegexNode node)
{
doneLabel = originalDoneLabel;
}

string postConditionalDoneLabel = doneLabel;
string resumeAt = ReserveName("conditionalexpression_resumeAt");
if (!isAtomic)
{
additionalDeclarations.Add($"int {resumeAt} = 0;");
}

// If we get to this point of the code, the conditional successfully matched, so run the "yes" branch.
// Since the "yes" branch may have a different execution path than the "no" branch or the lack of
Expand All @@ -1370,10 +1361,6 @@ void EmitExpressionConditional(RegexNode node)
writer.WriteLine();
TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
string postYesDoneLabel = doneLabel;
if (!isAtomic && postYesDoneLabel != originalDoneLabel)
{
writer.WriteLine($"{resumeAt} = 0;");
}
if (postYesDoneLabel != originalDoneLabel || noBranch is not null)
{
writer.WriteLine($"goto {end};");
Expand Down Expand Up @@ -1467,8 +1454,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)

TransferSliceStaticPosToPos();
string startingPos = ReserveName("capture_starting_pos");
additionalDeclarations.Add($"int {startingPos} = 0;");
writer.WriteLine($"{startingPos} = pos;");
writer.WriteLine($"int {startingPos} = pos;");
writer.WriteLine();

RegexNode child = node.Child(0);
Expand Down Expand Up @@ -1604,7 +1590,8 @@ node.Type is RegexNode.Atomic or // atomic nodes by definition don't give up any
RegexNode.Oneloopatomic or RegexNode.Notoneloopatomic or RegexNode.Setloopatomic or // same for atomic loops
RegexNode.One or RegexNode.Notone or RegexNode.Set or // individual characters don't backtrack
RegexNode.Multi or // multiple characters don't backtrack
RegexNode.Beginning or RegexNode.Start or RegexNode.End or RegexNode.EndZ or RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary or // anchors don't backtrack
RegexNode.Ref or // backreferences don't backtrack
RegexNode.Beginning or RegexNode.Bol or RegexNode.Start or RegexNode.End or RegexNode.EndZ or RegexNode.Eol or RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary or // anchors don't backtrack
RegexNode.Nothing or RegexNode.Empty or RegexNode.UpdateBumpalong // empty/nothing don't do anything
// Fixed-size repeaters of single characters or atomic don't backtrack
|| node.Type is RegexNode.Oneloop or RegexNode.Notoneloop or RegexNode.Setloop or RegexNode.Onelazy or RegexNode.Notonelazy or RegexNode.Setlazy && node.M == node.N
Expand Down Expand Up @@ -1965,26 +1952,30 @@ void EmitAnchors(RegexNode node)
break;

case RegexNode.End:
using (EmitBlock(writer, $"if ({sliceSpan}.Length > {sliceStaticPos})"))
using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()})"))
{
writer.WriteLine($"goto {doneLabel};");
}
break;

case RegexNode.EndZ:
writer.WriteLine($"if ({sliceSpan}.Length - 1 > {sliceStaticPos} || ({sliceSpan}.Length > {sliceStaticPos} && {sliceSpan}[{sliceStaticPos}] != '\\n'))");
writer.WriteLine($"if ({sliceSpan}.Length - 1 > {sliceStaticPos} || ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n'))");
using (EmitBlock(writer, null))
{
writer.WriteLine($"goto {doneLabel};");
}
break;

case RegexNode.Eol:
using (EmitBlock(writer, $"if ({sliceSpan}.Length > {sliceStaticPos} && {sliceSpan}[{sliceStaticPos}] != '\\n')"))
using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n')"))
{
writer.WriteLine($"goto {doneLabel};");
}
break;

string IsSliceLengthGreaterThanSliceStaticPos() =>
sliceStaticPos == 0 ? $"!{sliceSpan}.IsEmpty" :
$"{sliceSpan}.Length > {sliceStaticPos}";
}
}

Expand Down Expand Up @@ -2222,8 +2213,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
maxIterations = $"{node.N - node.M}";

iterationCount = ReserveName("lazyloop_iteration");
additionalDeclarations.Add($"int {iterationCount} = 0;");
writer.WriteLine($"{iterationCount} = 0;");
writer.WriteLine($"int {iterationCount} = 0;");
}

// Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
Expand Down Expand Up @@ -2366,18 +2356,15 @@ void EmitLazy(RegexNode node)
string body = ReserveName("LazyLoopBody");
string endLoop = ReserveName("LazyLoopEnd");

additionalDeclarations.Add($"int {iterationCount} = 0, {startingPos} = 0, {sawEmpty} = 0;");
writer.WriteLine($"{iterationCount} = 0;");
writer.WriteLine($"{startingPos} = pos;");
writer.WriteLine($"{sawEmpty} = 0;");
writer.WriteLine();
writer.WriteLine($"int {iterationCount} = 0, {startingPos} = pos, {sawEmpty} = 0;");

// If the min count is 0, start out by jumping right to what's after the loop. Backtracking
// will then bring us back in to do further iterations.
if (minIterations == 0)
{
writer.WriteLine($"goto {endLoop};");
}
writer.WriteLine();

// Iteration body
MarkLabel(body, emitSemicolon: false);
Expand Down Expand Up @@ -3279,7 +3266,7 @@ private static void ReplaceAdditionalDeclarations(IndentedTextWriter writer, Has
{
if (declarations.Count != 0)
{
StringBuilder tmp = new StringBuilder().AppendLine();
var tmp = new StringBuilder();
foreach (string decl in declarations.OrderBy(s => s))
{
for (int i = 0; i < indent; i++)
Expand Down
Loading