Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Recovery of Unterminated Regular Expressions #58289

Merged
merged 11 commits into from
May 24, 2024
7 changes: 1 addition & 6 deletions src/compiler/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ import {
DeleteExpression,
Diagnostic,
DiagnosticArguments,
DiagnosticCategory,
DiagnosticMessage,
Diagnostics,
DiagnosticWithDetachedLocation,
Expand Down Expand Up @@ -2144,11 +2143,7 @@ namespace Parser {
// Don't report another error if it would just be at the same position as the last error.
const lastError = lastOrUndefined(parseDiagnostics);
let result: DiagnosticWithDetachedLocation | undefined;
if (message.category === DiagnosticCategory.Message && lastError && start === lastError.start && length === lastError.length) {
result = createDetachedDiagnostic(fileName, sourceText, start, length, message, ...args);
addRelatedInfo(lastError, result);
}
else if (!lastError || start !== lastError.start) {
if (!lastError || start !== lastError.start) {
result = createDetachedDiagnostic(fileName, sourceText, start, length, message, ...args);
parseDiagnostics.push(result);
}
Expand Down
143 changes: 88 additions & 55 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import {
LanguageFeatureMinimumTarget,
LanguageVariant,
last,
lastOrUndefined,
LineAndCharacter,
MapLike,
parsePseudoBigInt,
Expand Down Expand Up @@ -2391,7 +2392,8 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
function reScanSlashToken(reportErrors?: boolean): SyntaxKind {
if (token === SyntaxKind.SlashToken || token === SyntaxKind.SlashEqualsToken) {
// Quickly get to the end of regex such that we know the flags
let p = tokenStart + 1;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is eliminated for readability

const startOfRegExpBody = tokenStart + 1;
pos = startOfRegExpBody;
let inEscape = false;
// Although nested character classes are allowed in Unicode Sets mode,
// an unescaped slash is nevertheless invalid even in a character class in Unicode mode.
Expand All @@ -2403,16 +2405,14 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
while (true) {
// If we reach the end of a file, or hit a newline, then this is an unterminated
// regex. Report error and return what we have so far.
if (p >= end) {
if (pos >= end) {
tokenFlags |= TokenFlags.Unterminated;
error(Diagnostics.Unterminated_regular_expression_literal);
break;
}

const ch = text.charCodeAt(p);
const ch = text.charCodeAt(pos);
if (isLineBreak(ch)) {
tokenFlags |= TokenFlags.Unterminated;
error(Diagnostics.Unterminated_regular_expression_literal);
break;
}

Expand All @@ -2424,7 +2424,6 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
else if (ch === CharacterCodes.slash && !inCharacterClass) {
// A slash within a character class is permissible,
// but in general it signals the end of the regexp literal.
p++;
break;
}
else if (ch === CharacterCodes.openBracket) {
Expand All @@ -2436,58 +2435,95 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
else if (ch === CharacterCodes.closeBracket) {
inCharacterClass = false;
}
p++;
pos++;
}
const isUnterminated = !!(tokenFlags & TokenFlags.Unterminated);
const endOfBody = p - (isUnterminated ? 0 : 1);
let regExpFlags = RegularExpressionFlags.None;
while (p < end) {
const ch = text.charCodeAt(p);
if (!isIdentifierPart(ch, languageVersion)) {
break;
}
if (reportErrors) {
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, p, 1);
const endOfRegExpBody = pos;
if (tokenFlags & TokenFlags.Unterminated) {
rbuckton marked this conversation as resolved.
Show resolved Hide resolved
// Search for the nearest unbalanced bracket for better recovery. Since the expression is
// invalid anyways, we take nested square brackets into consideration for the best guess.
pos = startOfRegExpBody;
inEscape = false;
let characterClassDepth = 0;
const bracketStack: CharacterCodes[] = [];
while (pos < endOfRegExpBody) {
const ch = text.charCodeAt(pos);
if (inEscape) {
inEscape = false;
}
else if (regExpFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, p, 1);
else if (ch === CharacterCodes.backslash) {
inEscape = true;
}
else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) {
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, p, 1);
else if (ch === CharacterCodes.openBracket) {
characterClassDepth++;
}
else {
regExpFlags |= flag;
checkRegularExpressionFlagAvailable(flag, p);
else if (ch === CharacterCodes.closeBracket && characterClassDepth) {
characterClassDepth--;
}
else if (!characterClassDepth) {
if (ch === CharacterCodes.openParen) {
bracketStack.push(CharacterCodes.closeParen);
}
else if (ch === CharacterCodes.openBrace) {
bracketStack.push(CharacterCodes.closeBrace);
}
else if (ch === lastOrUndefined(bracketStack)) {
bracketStack.pop();
}
rbuckton marked this conversation as resolved.
Show resolved Hide resolved
else if (ch === CharacterCodes.closeParen || ch === CharacterCodes.closeBracket || ch === CharacterCodes.closeBrace) {
// We encountered an unbalanced bracket outside a character class. Treat this position as the end of regex.
break;
}
}
pos++;
}
p++;
}
if (reportErrors) {
pos = tokenStart + 1;
const saveTokenPos = tokenStart;
const saveTokenFlags = tokenFlags;
scanRegularExpressionWorker(text, endOfBody, regExpFlags, isUnterminated, /*annexB*/ true);
if (!isUnterminated) {
pos = p;
}
tokenStart = saveTokenPos;
tokenFlags = saveTokenFlags;
// Whitespaces and semicolons at the end are not likely to be part of the regex
while (isWhiteSpaceLike(text.charCodeAt(pos - 1)) || text.charCodeAt(pos - 1) === CharacterCodes.semicolon) pos--;
error(Diagnostics.Unterminated_regular_expression_literal, tokenStart, pos - tokenStart);
}
else {
pos = p;
// Consume the slash character
pos++;
let regExpFlags = RegularExpressionFlags.None;
while (pos < end) {
const ch = codePointAt(text, pos);
if (!isIdentifierPart(ch, languageVersion)) {
break;
}
const size = charSize(ch);
if (reportErrors) {
const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch));
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
}
else if (regExpFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
}
else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) {
rbuckton marked this conversation as resolved.
Show resolved Hide resolved
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, size);
}
else {
regExpFlags |= flag;
checkRegularExpressionFlagAvailability(flag);
}
}
pos += size;
}
if (reportErrors) {
scanRange(startOfRegExpBody, endOfRegExpBody - startOfRegExpBody, () => {
scanRegularExpressionWorker(regExpFlags, /*annexB*/ true);
});
}
}
tokenValue = text.substring(tokenStart, pos);
token = SyntaxKind.RegularExpressionLiteral;
}
return token;

function scanRegularExpressionWorker(text: string, end: number, regExpFlags: RegularExpressionFlags, isUnterminated: boolean, annexB: boolean) {
/** Grammar parameter */
const unicodeSetsMode = !!(regExpFlags & RegularExpressionFlags.UnicodeSets);
function scanRegularExpressionWorker(regExpFlags: RegularExpressionFlags, annexB: boolean) {
/** Grammar parameter */
const unicodeMode = !!(regExpFlags & RegularExpressionFlags.UnicodeMode);
/** Grammar parameter */
const unicodeSetsMode = !!(regExpFlags & RegularExpressionFlags.UnicodeSets);

if (unicodeMode) {
// Annex B treats any unicode mode as the strict syntax.
Expand All @@ -2497,7 +2533,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
/** @see {scanClassSetExpression} */
let mayContainStrings = false;

/** The number of numeric (anonymous) capturing groups defined in the regex. */
/** The number of all (named and unnamed) capturing groups defined in the regex. */
let numberOfCapturingGroups = 0;
/** All named capturing groups defined in the regex. */
const groupSpecifiers = new Set<string>();
Expand Down Expand Up @@ -2698,10 +2734,6 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
// falls through
case CharacterCodes.closeBracket:
case CharacterCodes.closeBrace:
if (isUnterminated && !isInGroup) {
// Assume what starting from the character to be outside of the regex
return;
}
if (unicodeMode || ch === CharacterCodes.closeParen) {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, pos, 1, String.fromCharCode(ch));
}
Expand All @@ -2721,25 +2753,26 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean

function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags {
while (pos < end) {
const ch = text.charCodeAt(pos);
const ch = codePointAt(text, pos);
if (!isIdentifierPart(ch, languageVersion)) {
break;
}
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
const size = charSize(ch);
const flag = characterToRegularExpressionFlag(utf16EncodeAsString(ch));
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
}
else if (currFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
}
else if (!(flag & RegularExpressionFlags.Modifiers)) {
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1);
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, size);
}
else {
currFlags |= flag;
checkRegularExpressionFlagAvailable(flag, pos);
checkRegularExpressionFlagAvailability(flag);
}
pos++;
pos += size;
}
return currFlags;
}
Expand Down Expand Up @@ -3439,7 +3472,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
});
}

function checkRegularExpressionFlagAvailable(flag: RegularExpressionFlags, pos: number) {
function checkRegularExpressionFlagAvailability(flag: RegularExpressionFlags) {
const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag) as ScriptTarget | undefined;
if (availableFrom && languageVersion < availableFrom) {
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(availableFrom));
Expand Down
1 change: 1 addition & 0 deletions src/testRunner/tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import "./unittests/paths";
import "./unittests/printer";
import "./unittests/programApi";
import "./unittests/publicApi";
import "./unittests/regExpParserRecovery";
import "./unittests/reuseProgramStructure";
import "./unittests/semver";
import "./unittests/transform";
Expand Down
2 changes: 1 addition & 1 deletion src/testRunner/unittests/incrementalParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ describe("unittests:: Incremental Parser", () => {
const oldText = ts.ScriptSnapshot.fromString(source);
const newTextAndChange = withInsert(oldText, semicolonIndex, "/");

compareTrees(oldText, newTextAndChange.text, newTextAndChange.textChangeRange, 0);
compareTrees(oldText, newTextAndChange.text, newTextAndChange.textChangeRange, 4);
});

it("Regular expression 2", () => {
Expand Down
81 changes: 81 additions & 0 deletions src/testRunner/unittests/regExpParserRecovery.ts
graphemecluster marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import * as ts from "../_namespaces/ts";

describe("unittests:: regExpParserRecovery", () => {
const testCases = [
"/",
"/[]",
"/{}",
"/()",
"/foo",
"/foo[]",
"/foo{}",
"/foo()",
"/[]foo",
"/{}foo",
"/()foo",
"/{[]}",
"/([])",
"/[)}({]",
"/({[)}]})",
"/\\[",
"/\\{",
"/\\(",
"/[\\[]",
"/(\\[)",
"/{\\[}",
"/[\\(]",
"/(\\()",
"/{\\(}",
"/[\\{]",
"/(\\{)",
"/{\\{}",
"/\\{(\\[\\([{])",
"/\\]",
"/\\}",
"/\\)",
"/[\\]]",
"/(\\])",
"/{\\]}",
"/[\\)]",
"/(\\))",
"/{\\)}",
"/[\\}]",
"/(\\})",
"/{\\}}",
"/({[\\]})]})",
];
const whiteSpaceSequences = [
"",
" ",
"\t\v\r\n",
"\u3000\u2028",
];
it("stops parsing unterminated regexes at correct position", () => {
graphemecluster marked this conversation as resolved.
Show resolved Hide resolved
ts.forEach(testCases, testCase => {
ts.forEach(whiteSpaceSequences, whiteSpaces => {
const testCaseWithWhiteSpaces = testCase + whiteSpaces;
const sources = [
`const regex = ${testCaseWithWhiteSpaces};`,
`(${testCaseWithWhiteSpaces});`,
`([${testCaseWithWhiteSpaces}]);`,
`({prop: ${testCaseWithWhiteSpaces}});`,
`({prop: ([(${testCaseWithWhiteSpaces})])});`,
`({[(${testCaseWithWhiteSpaces}).source]: 42});`,
];
ts.forEach(sources, source => {
const { parseDiagnostics } = ts.createLanguageServiceSourceFile(
/*fileName*/ "",
ts.ScriptSnapshot.fromString(source),
ts.ScriptTarget.Latest,
/*version*/ "0",
/*setNodeParents*/ false,
);
const diagnostic = ts.find(parseDiagnostics, ({ code }) => code === ts.Diagnostics.Unterminated_regular_expression_literal.code);
assert(diagnostic, "There should be an 'Unterminated regular expression literal.' error");
assert.equal(diagnostic.start, source.indexOf("/"), "Diagnostic should start at where the regex starts");
assert.equal(diagnostic.length, testCase.length, "Diagnostic should end at where the regex ends");
});
});
});
});
});
7 changes: 2 additions & 5 deletions tests/baselines/reference/parser645086_1.errors.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
parser645086_1.ts(1,13): error TS1005: ',' expected.
parser645086_1.ts(1,14): error TS1134: Variable declaration expected.
parser645086_1.ts(1,15): error TS1161: Unterminated regular expression literal.


==== parser645086_1.ts (3 errors) ====
==== parser645086_1.ts (2 errors) ====
var v = /[]/]/
~
!!! error TS1005: ',' expected.
~
!!! error TS1134: Variable declaration expected.

!!! error TS1161: Unterminated regular expression literal.
!!! error TS1134: Variable declaration expected.
7 changes: 2 additions & 5 deletions tests/baselines/reference/parser645086_2.errors.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
parser645086_2.ts(1,14): error TS1005: ',' expected.
parser645086_2.ts(1,15): error TS1134: Variable declaration expected.
parser645086_2.ts(1,16): error TS1161: Unterminated regular expression literal.


==== parser645086_2.ts (3 errors) ====
==== parser645086_2.ts (2 errors) ====
var v = /[^]/]/
~
!!! error TS1005: ',' expected.
~
!!! error TS1134: Variable declaration expected.

!!! error TS1161: Unterminated regular expression literal.
!!! error TS1134: Variable declaration expected.
Loading