From 531514048a1dbfabe9ae983dab49bb4239589469 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Sat, 1 May 2021 18:01:07 +0200 Subject: [PATCH] Tests: Exhaustive pattern tests (#2688) --- components/prism-erb.js | 2 +- components/prism-erb.min.js | 2 +- tests/pattern-tests.js | 846 ++++++++++++++++++++++-------------- 3 files changed, 511 insertions(+), 339 deletions(-) diff --git a/components/prism-erb.js b/components/prism-erb.js index 132f4f56d1..989d56f35f 100644 --- a/components/prism-erb.js +++ b/components/prism-erb.js @@ -9,7 +9,7 @@ }); Prism.hooks.add('before-tokenize', function (env) { - var erbPattern = /<%=?(?:[^\r\n]|[\r\n](?!=begin)|[\r\n]=begin\s[\s\S]*?^=end)+?%>/gm; + var erbPattern = /<%=?(?:[^\r\n]|[\r\n](?!=begin)|[\r\n]=begin\s(?:[^\r\n]|[\r\n](?!=end))*[\r\n]=end)+?%>/gm; Prism.languages['markup-templating'].buildPlaceholders(env, 'erb', erbPattern); }); diff --git a/components/prism-erb.min.js b/components/prism-erb.min.js index 25d3c27b6b..a9ca5e42a1 100644 --- a/components/prism-erb.min.js +++ b/components/prism-erb.min.js @@ -1 +1 @@ -!function(n){n.languages.erb=n.languages.extend("ruby",{}),n.languages.insertBefore("erb","comment",{delimiter:{pattern:/^<%=?|%>$/,alias:"punctuation"}}),n.hooks.add("before-tokenize",function(e){n.languages["markup-templating"].buildPlaceholders(e,"erb",/<%=?(?:[^\r\n]|[\r\n](?!=begin)|[\r\n]=begin\s[\s\S]*?^=end)+?%>/gm)}),n.hooks.add("after-tokenize",function(e){n.languages["markup-templating"].tokenizePlaceholders(e,"erb")})}(Prism); \ No newline at end of file +!function(n){n.languages.erb=n.languages.extend("ruby",{}),n.languages.insertBefore("erb","comment",{delimiter:{pattern:/^<%=?|%>$/,alias:"punctuation"}}),n.hooks.add("before-tokenize",function(e){n.languages["markup-templating"].buildPlaceholders(e,"erb",/<%=?(?:[^\r\n]|[\r\n](?!=begin)|[\r\n]=begin\s(?:[^\r\n]|[\r\n](?!=end))*[\r\n]=end)+?%>/gm)}),n.hooks.add("after-tokenize",function(e){n.languages["markup-templating"].tokenizePlaceholders(e,"erb")})}(Prism); \ No newline at end of file diff --git a/tests/pattern-tests.js b/tests/pattern-tests.js index bf1ac4fcdd..90535e4489 100644 --- a/tests/pattern-tests.js +++ b/tests/pattern-tests.js @@ -1,26 +1,41 @@ +// @ts-check 'use strict'; const { assert } = require('chai'); const PrismLoader = require('./helper/prism-loader'); +const TestDiscovery = require('./helper/test-discovery'); +const TestCase = require('./helper/test-case'); const { BFS, parseRegex } = require('./helper/util'); const { languages } = require('../components.json'); const { visitRegExpAST } = require('regexpp'); const { transform, combineTransformers, JS, Words, NFA, Transformers } = require('refa'); const scslre = require('scslre'); +const path = require('path'); /** - * A set of all safe (non-exponentially backtracking) RegExp literals (string). + * A map from language id to a list of code snippets in that language. * - * @type {Set} + * @type {Map} */ -const expoSafeRegexes = new Set(); +const testSnippets = new Map(); +const testSuite = TestDiscovery.loadAllTests(__dirname + '/languages'); +for (const languageIdentifier in testSuite) { + const lang = TestCase.parseLanguageNames(languageIdentifier).mainLanguage; + let snippets = testSnippets.get(lang); + if (snippets === undefined) { + snippets = []; + testSnippets.set(lang, snippets); + } + + for (const file of testSuite[languageIdentifier]) { + if (path.extname(file) === '.test') { + snippets.push(TestCase.parseTestCaseFile(file).code); + } else { + snippets.push(...Object.keys(require(file))); + } + } +} -/** - * A set of all safe (non-polynomially backtracking) RegExp literals (string). - * - * @type {Set} - */ -const polySafeRegexes = new Set(); for (const lang in languages) { if (lang === 'meta') { @@ -29,19 +44,9 @@ for (const lang in languages) { describe(`Patterns of '${lang}'`, function () { const Prism = PrismLoader.createInstance(lang); - testPatterns(Prism); + testPatterns(Prism, lang); }); - function toArray(value) { - if (Array.isArray(value)) { - return value; - } else if (value != null) { - return [value]; - } else { - return []; - } - } - let optional = toArray(languages[lang].optional); let modify = toArray(languages[lang].modify); @@ -56,7 +61,7 @@ for (const lang in languages) { describe(name, function () { const Prism = PrismLoader.createInstance([...optional, ...modify, lang]); - testPatterns(Prism); + testPatterns(Prism, lang); }); } } @@ -65,6 +70,7 @@ for (const lang in languages) { * Tests all patterns in the given Prism instance. * * @param {any} Prism + * @param {string} mainLanguage * * @typedef {import("./helper/util").LiteralAST} LiteralAST * @typedef {import("regexpp/ast").CapturingGroup} CapturingGroup @@ -73,7 +79,40 @@ for (const lang in languages) { * @typedef {import("regexpp/ast").LookaroundAssertion} LookaroundAssertion * @typedef {import("regexpp/ast").Pattern} Pattern */ -function testPatterns(Prism) { +function testPatterns(Prism, mainLanguage) { + + /** + * Returns a list of relevant languages in the Prism instance. + * + * The list does not included readonly dependencies and aliases. + * + * @returns {string[]} + */ + function getRelevantLanguages() { + return [mainLanguage, ...toArray(languages[mainLanguage].modify)] + .filter(lang => lang in Prism.languages); + } + + /** + * @param {string} root + * @param {Parameters[1]>[0]} path + * @returns {string} + */ + function BFSPathToString(root, path) { + let pathStr = root; + for (const { key } of path) { + if (!key) { + // do nothing + } else if (/^\d+$/.test(key)) { + pathStr += `[${key}]`; + } else if (/^[a-z]\w*$/i.test(key)) { + pathStr += `.${key}`; + } else { + pathStr += `[${JSON.stringify(key)}]`; + } + } + return pathStr; + } /** * Invokes the given function on every pattern in `Prism.languages`. @@ -94,49 +133,73 @@ function testPatterns(Prism) { * @property {(message: string) => void} reportError */ function forEachPattern(callback) { + const visited = new Set(); const errors = []; - BFS(Prism.languages, path => { - const { key, value } = path[path.length - 1]; - - let tokenPath = 'Prism.languages'; - for (const { key } of path) { - if (!key) { - // do nothing - } else if (/^\d+$/.test(key)) { - tokenPath += `[${key}]`; - } else if (/^[a-z]\w*$/i.test(key)) { - tokenPath += `.${key}`; - } else { - tokenPath += `[${JSON.stringify(key)}]`; - } + /** + * @param {object} root + * @param {string} rootStr + */ + function traverse(root, rootStr) { + if (visited.has(root)) { + return; } + visited.add(root); - if (Object.prototype.toString.call(value) == '[object RegExp]') { - try { - let ast; + BFS(root, path => { + const { key, value } = path[path.length - 1]; + visited.add(value); + + const tokenPath = BFSPathToString(rootStr, path); + + if (Object.prototype.toString.call(value) == '[object RegExp]') { try { - ast = parseRegex(value); + let ast; + try { + ast = parseRegex(value); + } catch (error) { + throw new SyntaxError(`Invalid RegExp at ${tokenPath}\n\n${error.message}`); + } + + const parent = path.length > 1 ? path[path.length - 2].value : undefined; + callback({ + pattern: value, + ast, + tokenPath, + name: key, + parent, + path, + lookbehind: key === 'pattern' && parent && !!parent.lookbehind, + reportError: message => errors.push(message) + }); } catch (error) { - throw new SyntaxError(`Invalid RegExp at ${tokenPath}\n\n${error.message}`); + errors.push(error); } - - const parent = path.length > 1 ? path[path.length - 2].value : undefined; - callback({ - pattern: value, - ast, - tokenPath, - name: key, - parent, - path, - lookbehind: key === 'pattern' && parent && !!parent.lookbehind, - reportError: message => errors.push(message) - }); - } catch (error) { - errors.push(error); } + }); + } + + // static analysis + traverse(Prism.languages, 'Prism.languages'); + + // dynamic analysis + for (const lang of getRelevantLanguages()) { + const snippets = testSnippets.get(lang); + const grammar = Prism.languages[lang]; + + const oldTokenize = Prism.tokenize; + Prism.tokenize = function (_, grammar) { + const result = oldTokenize.apply(this, arguments); + traverse(grammar, lang + ': '); + return result; + }; + + for (const snippet of (snippets || [])) { + Prism.highlight(snippet, grammar, lang); } - }); + + Prism.tokenize = oldTokenize; + } if (errors.length > 0) { throw new Error(errors.map(e => String(e.message || e)).join('\n\n')); @@ -165,91 +228,6 @@ function testPatterns(Prism) { }); } - /** - * Returns whether the given element will always have zero width meaning that it doesn't consume characters. - * - * @param {Element} element - * @returns {boolean} - */ - function isAlwaysZeroWidth(element) { - switch (element.type) { - case 'Assertion': - // assertions == ^, $, \b, lookarounds - return true; - case 'Quantifier': - return element.max === 0 || isAlwaysZeroWidth(element.element); - case 'CapturingGroup': - case 'Group': - // every element in every alternative has to be of zero length - return element.alternatives.every(alt => alt.elements.every(isAlwaysZeroWidth)); - case 'Backreference': - // on if the group referred to is of zero length - return isAlwaysZeroWidth(element.resolved); - default: - return false; // what's left are characters - } - } - - /** - * Returns whether the given element will always at the start of the whole match. - * - * @param {Element} element - * @returns {boolean} - */ - function isFirstMatch(element) { - const parent = element.parent; - switch (parent.type) { - case 'Alternative': { - // all elements before this element have to of zero length - if (!parent.elements.slice(0, parent.elements.indexOf(element)).every(isAlwaysZeroWidth)) { - return false; - } - const grandParent = parent.parent; - if (grandParent.type === 'Pattern') { - return true; - } else { - return isFirstMatch(grandParent); - } - } - - case 'Quantifier': - if (parent.max >= 2) { - return false; - } else { - return isFirstMatch(parent); - } - - default: - throw new Error(`Internal error: The given node should not be a '${element.type}'.`); - } - } - - /** - * Returns whether the given node either is or is a child of what is effectively a Kleene star. - * - * @param {import("regexpp/ast").Node} node - * @returns {boolean} - */ - function underAStar(node) { - if (node.type === 'Quantifier' && node.max > 10) { - return true; - } else if (node.parent) { - return underAStar(node.parent); - } else { - return false; - } - } - - /** - * @param {Iterable} iter - * @returns {T | undefined} - * @template T - */ - function firstOf(iter) { - const [first] = iter; - return first; - } - it('- should not match the empty string', function () { forEachPattern(({ pattern, tokenPath }) => { @@ -384,221 +362,370 @@ function testPatterns(Prism) { }); it('- should not cause exponential backtracking', function () { - /** @type {Transformers.CreationOptions} */ - const options = { - ignoreOrder: true, - ignoreAmbiguity: true - }; - const transformer = combineTransformers([ - Transformers.inline(options), - Transformers.removeDeadBranches(options), - Transformers.unionCharacters(options), - Transformers.moveUpEmpty(options), - Transformers.nestedQuantifiers(options), - Transformers.sortAssertions(options), - Transformers.removeUnnecessaryAssertions(options), - Transformers.applyAssertions(options), - ]); - - forEachPattern(({ pattern, ast, tokenPath }) => { - const patternStr = String(pattern); - if (expoSafeRegexes.has(patternStr)) { - // we know that the pattern won't cause exp backtracking because we checked before - return; - } + replaceRegExpProto(exec => { + return function (input) { + checkExponentialBacktracking('', this); + return exec.call(this, input); + }; + }, () => { + forEachPattern(({ pattern, ast, tokenPath }) => { + checkExponentialBacktracking(tokenPath, pattern, ast); + }); + }); + }); - const parser = JS.Parser.fromAst(ast); - /** - * Parses the given element and creates its NFA. - * - * @param {import("refa").JS.ParsableElement} element - * @returns {NFA} - */ - function toNFA(element) { - let { expression, maxCharacter } = parser.parseElement(element, { - maxBackreferenceWords: 1000, - backreferences: 'disable' - }); + it('- should not cause polynomial backtracking', function () { + replaceRegExpProto(exec => { + return function (input) { + checkPolynomialBacktracking('', this); + return exec.call(this, input); + }; + }, () => { + forEachPattern(({ pattern, ast, tokenPath }) => { + checkPolynomialBacktracking(tokenPath, pattern, ast); + }); + }); + }); - // try to remove assertions - expression = transform(transformer, expression); +} - return NFA.fromRegex(expression, { maxCharacter }, { assertions: 'disable' }); - } - /** - * Checks whether the alternatives of the given node are disjoint. If the alternatives are not disjoint - * and the give node is a descendant of an effective Kleene star, then an error will be thrown. - * - * @param {CapturingGroup | Group | LookaroundAssertion} node - * @returns {void} - */ - function checkDisjointAlternatives(node) { - if (!underAStar(node) || node.alternatives.length < 2) { - return; - } +/** + * Returns whether the given element will always have zero width meaning that it doesn't consume characters. + * + * @param {Element} element + * @returns {boolean} + */ +function isAlwaysZeroWidth(element) { + switch (element.type) { + case 'Assertion': + // assertions == ^, $, \b, lookarounds + return true; + case 'Quantifier': + return element.max === 0 || isAlwaysZeroWidth(element.element); + case 'CapturingGroup': + case 'Group': + // every element in every alternative has to be of zero length + return element.alternatives.every(alt => alt.elements.every(isAlwaysZeroWidth)); + case 'Backreference': + // on if the group referred to is of zero length + return isAlwaysZeroWidth(element.resolved); + default: + return false; // what's left are characters + } +} - const alternatives = node.alternatives; - - const total = toNFA(alternatives[0]); - total.withoutEmptyWord(); - for (let i = 1, l = alternatives.length; i < l; i++) { - const a = alternatives[i]; - const current = toNFA(a); - current.withoutEmptyWord(); - - if (!total.isDisjointWith(current)) { - assert.fail(`${tokenPath}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.` - + ` This will cause exponential backtracking.` - + `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.` - + ` The goal is that all of its alternatives are disjoint.` - + ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.` - + `\n\nExample: \`(?:[ab]|\\w|::)+\`` - + `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.` - + ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.` - + `\nIn the real world, patterns can be a lot harder to fix.` - + ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.` - + ` A maintainer will help you.` - + `\n\nFull pattern:\n${pattern}`); - } else if (i !== l - 1) { - total.union(current); - } - } +/** + * Returns whether the given element will always at the start of the whole match. + * + * @param {Element} element + * @returns {boolean} + */ +function isFirstMatch(element) { + const parent = element.parent; + switch (parent.type) { + case 'Alternative': { + // all elements before this element have to of zero length + if (!parent.elements.slice(0, parent.elements.indexOf(element)).every(isAlwaysZeroWidth)) { + return false; } + const grandParent = parent.parent; + if (grandParent.type === 'Pattern') { + return true; + } else { + return isFirstMatch(grandParent); + } + } - visitRegExpAST(ast.pattern, { - onCapturingGroupLeave: checkDisjointAlternatives, - onGroupLeave: checkDisjointAlternatives, - onAssertionLeave(node) { - if (node.kind === 'lookahead' || node.kind === 'lookbehind') { - checkDisjointAlternatives(node); - } - }, + case 'Quantifier': + if (parent.max >= 2) { + return false; + } else { + return isFirstMatch(parent); + } - onQuantifierLeave(node) { - if (node.max < 10) { - return; // not a star - } - if (node.element.type !== 'CapturingGroup' && node.element.type !== 'Group') { - return; // not a group - } + default: + throw new Error(`Internal error: The given node should not be a '${element.type}'.`); + } +} - // The idea here is the following: - // - // We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be - // the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w` - // that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w` - // is accepted by `A{m}`. - // This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as - // `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main - // requirement for exponential backtracking. - // - // This is actually only a crude approximation for the real analysis that would have to be done. We - // would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most - // cases, the approximation is good enough. - - const nfa = toNFA(node.element); - nfa.withoutEmptyWord(); - const twoStar = nfa.copy(); - twoStar.quantify(2, Infinity); - - if (!nfa.isDisjointWith(twoStar)) { - const word = Words.pickMostReadableWord(firstOf(nfa.intersectionWordSets(twoStar))); - const example = Words.fromUnicodeToString(word); - assert.fail(`${tokenPath}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.` - + ` This will cause exponential backtracking.` - + `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.` - + ` The goal is modify E such that it is disjoint with repetitions of itself.` - + ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.` - + `\n\nExample 1: \`(?:\\w+|::)+\`` - + `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.` - + ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.` - + `\nExample 2: \`(?:\\w|Foo)+\`` - + `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.` - + ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.` - + `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\`` - + `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.` - + ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.` - + ` This means that the lazy \`.*?\` can jump over \`>\`.` - + ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.` - + ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.` - + `\n\nIn the real world, patterns can be a lot harder to fix.` - + ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.` - + `\n\nFull pattern:\n${pattern}`); - } - }, - }); +/** + * Returns whether the given node either is or is a child of what is effectively a Kleene star. + * + * @param {import("regexpp/ast").Node} node + * @returns {boolean} + */ +function underAStar(node) { + if (node.type === 'Quantifier' && node.max > 10) { + return true; + } else if (node.parent) { + return underAStar(node.parent); + } else { + return false; + } +} + +/** + * @param {Iterable} iter + * @returns {T | undefined} + * @template T + */ +function firstOf(iter) { + const [first] = iter; + return first; +} + +/** + * A set of all safe (non-exponentially backtracking) RegExp literals (string). + * + * @type {Set} + */ +const expoSafeRegexes = new Set(); - expoSafeRegexes.add(patternStr); +/** @type {Transformers.CreationOptions} */ +const options = { + ignoreOrder: true, + ignoreAmbiguity: true +}; +const transformer = combineTransformers([ + Transformers.inline(options), + Transformers.removeDeadBranches(options), + Transformers.unionCharacters(options), + Transformers.moveUpEmpty(options), + Transformers.nestedQuantifiers(options), + Transformers.sortAssertions(options), + Transformers.removeUnnecessaryAssertions(options), + Transformers.applyAssertions(options), +]); + + +/** + * @param {string} path + * @param {RegExp} pattern + * @param {LiteralAST} [ast] + * @returns {void} + */ +function checkExponentialBacktracking(path, pattern, ast) { + if (expoSafeRegexes.has(pattern)) { + // we know that the pattern won't cause exp backtracking because we checked before + return; + } + const patternStr = String(pattern); + if (expoSafeRegexes.has(patternStr)) { + // we know that the pattern won't cause exp backtracking because we checked before + return; + } + + if (!ast) { + ast = parseRegex(pattern); + } + + const parser = JS.Parser.fromAst(ast); + /** + * Parses the given element and creates its NFA. + * + * @param {import("refa").JS.ParsableElement} element + * @returns {NFA} + */ + function toNFA(element) { + let { expression, maxCharacter } = parser.parseElement(element, { + maxBackreferenceWords: 1000, + backreferences: 'disable' }); - }); - it('- should not cause polynomial backtracking', function () { - forEachPattern(({ pattern, ast, tokenPath }) => { - const patternStr = String(pattern); - if (polySafeRegexes.has(patternStr)) { - // we know that the pattern won't cause poly backtracking because we checked before - return; + return NFA.fromRegex(transform(transformer, expression), { maxCharacter }, { assertions: 'disable' }); + } + + /** + * Checks whether the alternatives of the given node are disjoint. If the alternatives are not disjoint + * and the give node is a descendant of an effective Kleene star, then an error will be thrown. + * + * @param {CapturingGroup | Group | LookaroundAssertion} node + * @returns {void} + */ + function checkDisjointAlternatives(node) { + if (!underAStar(node) || node.alternatives.length < 2) { + return; + } + + const alternatives = node.alternatives; + + const total = toNFA(alternatives[0]); + total.withoutEmptyWord(); + for (let i = 1, l = alternatives.length; i < l; i++) { + const a = alternatives[i]; + const current = toNFA(a); + current.withoutEmptyWord(); + + if (!total.isDisjointWith(current)) { + assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.` + + ` This will cause exponential backtracking.` + + `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.` + + ` The goal is that all of its alternatives are disjoint.` + + ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.` + + `\n\nExample: \`(?:[ab]|\\w|::)+\`` + + `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.` + + ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.` + + `\nIn the real world, patterns can be a lot harder to fix.` + + ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.` + + ` A maintainer will help you.` + + `\n\nFull pattern:\n${pattern}`); + } else if (i !== l - 1) { + total.union(current); } + } + } - const result = scslre.analyse(ast, { maxReports: 1, reportTypes: { 'Move': false } }); - if (result.reports.length > 0) { - const report = result.reports[0]; - - let rangeOffset; - let rangeStr; - let rangeHighlight; - - switch (report.type) { - case 'Trade': { - const start = Math.min(report.startQuant.start, report.endQuant.start); - const end = Math.max(report.startQuant.end, report.endQuant.end); - rangeOffset = start + 1; - rangeStr = patternStr.substring(start + 1, end + 1); - rangeHighlight = highlight([ - { ...report.startQuant, label: 'start' }, - { ...report.endQuant, label: 'end' } - ], -start); - break; - } - case 'Self': { - rangeOffset = report.parentQuant.start + 1; - rangeStr = patternStr.substring(report.parentQuant.start + 1, report.parentQuant.end + 1); - rangeHighlight = highlight([{ ...report.quant, label: 'self' }], -report.parentQuant.start); - break; - } - case 'Move': { - rangeOffset = 1; - rangeStr = patternStr.substring(1, report.quant.end + 1); - rangeHighlight = highlight([report.quant]); - break; - } - default: - throw new Error('Invalid report type "' + report.type + '". This should never happen.'); - } + visitRegExpAST(ast.pattern, { + onCapturingGroupLeave: checkDisjointAlternatives, + onGroupLeave: checkDisjointAlternatives, + onAssertionLeave(node) { + if (node.kind === 'lookahead' || node.kind === 'lookbehind') { + checkDisjointAlternatives(node); + } + }, - const attackChar = `/${report.character.literal.source}/${report.character.literal.flags}`; - const fixed = report.fix(); - - assert.fail( - `${tokenPath}: ${report.exponential ? 'Exponential' : 'Polynomial'} backtracking. ` - + `By repeating any character that matches ${attackChar}, an attack string can be created.` - + `\n` - + `\n${indent(rangeStr)}` - + `\n${indent(rangeHighlight)}` - + `\n` - + `\nFull pattern:` - + `\n${patternStr}` - + `\n${indent(rangeHighlight, ' '.repeat(rangeOffset))}` - + `\n` - + `\n` + (fixed ? `Fixed:\n/${fixed.source}/${fixed.flags}` : `Fix not available.`) - ); + onQuantifierLeave(node) { + if (node.max < 10) { + return; // not a star + } + if (node.element.type !== 'CapturingGroup' && node.element.type !== 'Group') { + return; // not a group } - polySafeRegexes.add(patternStr); - }); + // The idea here is the following: + // + // We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be + // the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w` + // that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w` + // is accepted by `A{m}`. + // This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as + // `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main + // requirement for exponential backtracking. + // + // This is actually only a crude approximation for the real analysis that would have to be done. We + // would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most + // cases, the approximation is good enough. + + const nfa = toNFA(node.element); + nfa.withoutEmptyWord(); + const twoStar = nfa.copy(); + twoStar.quantify(2, Infinity); + + if (!nfa.isDisjointWith(twoStar)) { + const word = Words.pickMostReadableWord(firstOf(nfa.intersectionWordSets(twoStar))); + const example = Words.fromUnicodeToString(word); + assert.fail(`${path}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.` + + ` This will cause exponential backtracking.` + + `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.` + + ` The goal is modify E such that it is disjoint with repetitions of itself.` + + ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.` + + `\n\nExample 1: \`(?:\\w+|::)+\`` + + `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.` + + ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.` + + `\nExample 2: \`(?:\\w|Foo)+\`` + + `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.` + + ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.` + + `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\`` + + `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.` + + ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.` + + ` This means that the lazy \`.*?\` can jump over \`>\`.` + + ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.` + + ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.` + + `\n\nIn the real world, patterns can be a lot harder to fix.` + + ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.` + + `\n\nFull pattern:\n${pattern}`); + } + }, }); + expoSafeRegexes.add(pattern); + expoSafeRegexes.add(patternStr); +} + +/** + * A set of all safe (non-polynomially backtracking) RegExp literals (string). + * + * @type {Set} + */ +const polySafeRegexes = new Set(); +/** + * @param {string} path + * @param {RegExp} pattern + * @param {LiteralAST} [ast] + * @returns {void} + */ +function checkPolynomialBacktracking(path, pattern, ast) { + if (polySafeRegexes.has(pattern)) { + // we know that the pattern won't cause poly backtracking because we checked before + return; + } + const patternStr = String(pattern); + if (polySafeRegexes.has(patternStr)) { + // we know that the pattern won't cause poly backtracking because we checked before + return; + } + + if (!ast) { + ast = parseRegex(pattern); + } + + const result = scslre.analyse(ast, { maxReports: 1, reportTypes: { 'Move': false } }); + if (result.reports.length > 0) { + const report = result.reports[0]; + + let rangeOffset; + let rangeStr; + let rangeHighlight; + + switch (report.type) { + case 'Trade': { + const start = Math.min(report.startQuant.start, report.endQuant.start); + const end = Math.max(report.startQuant.end, report.endQuant.end); + rangeOffset = start + 1; + rangeStr = patternStr.substring(start + 1, end + 1); + rangeHighlight = highlight([ + { ...report.startQuant, label: 'start' }, + { ...report.endQuant, label: 'end' } + ], -start); + break; + } + case 'Self': { + rangeOffset = report.parentQuant.start + 1; + rangeStr = patternStr.substring(report.parentQuant.start + 1, report.parentQuant.end + 1); + rangeHighlight = highlight([{ ...report.quant, label: 'self' }], -report.parentQuant.start); + break; + } + case 'Move': { + rangeOffset = 1; + rangeStr = patternStr.substring(1, report.quant.end + 1); + rangeHighlight = highlight([report.quant]); + break; + } + default: + throw new Error('Invalid report type. This should never happen.'); + } + + const attackChar = `/${report.character.literal.source}/${report.character.literal.flags}`; + const fixed = report.fix(); + + assert.fail( + `${path}: ${report.exponential ? 'Exponential' : 'Polynomial'} backtracking. ` + + `By repeating any character that matches ${attackChar}, an attack string can be created.` + + `\n` + + `\n${indent(rangeStr)}` + + `\n${indent(rangeHighlight)}` + + `\n` + + `\nFull pattern:` + + `\n${patternStr}` + + `\n${indent(rangeHighlight, ' '.repeat(rangeOffset))}` + + `\n` + + `\n` + (fixed ? `Fixed:\n/${fixed.source}/${fixed.flags}` : `Fix not available.`) + ); + } + + polySafeRegexes.add(pattern); + polySafeRegexes.add(patternStr); } /** @@ -609,6 +736,7 @@ function testPatterns(Prism) { * @typedef Highlight * @property {number} start * @property {number} end + * @property {string} [label] */ function highlight(highlights, offset = 0) { highlights.sort((a, b) => a.start - b.start); @@ -646,3 +774,47 @@ function highlight(highlights, offset = 0) { function indent(str, amount = ' ') { return str.split(/\r?\n/g).map(m => m === '' ? '' : amount + m).join('\n'); } + +/** + * @param {(exec: RegExp["exec"]) => RegExp["exec"]} execSupplier + * @param {() => void} fn + */ +function replaceRegExpProto(execSupplier, fn) { + const oldExec = RegExp.prototype.exec; + const oldTest = RegExp.prototype.test; + const newExec = execSupplier(oldExec); + + RegExp.prototype.exec = newExec; + RegExp.prototype.test = function (input) { + return newExec.call(this, input) !== null; + }; + + let error; + try { + fn(); + } catch (e) { + error = e; + } + + RegExp.prototype.exec = oldExec; + RegExp.prototype.test = oldTest; + + if (error) { + throw error; + } +} + +/** + * @param {undefined | null | T | T[]} value + * @returns {T[]} + * @template T + */ +function toArray(value) { + if (Array.isArray(value)) { + return value; + } else if (value != null) { + return [value]; + } else { + return []; + } +}