Parser: Make attribute parsing possessive (#12342)

* Parser: Make attribute parsing possessive Bug introduced in #11369 Someone discovered high CPU usage due to catastrophic backtracking on an invalid block comment delimiter. The following input crashed the parser on the server: ```html  ``` The optimization introduced in #11369 ended up opening a place for backtracking that shouldn't be there. In this patch we're grouping the attribute parsing section of the tokenizing RegExp pattern so that we can make the group itself _possessive_ so that we abort any backtracking. * add test and fix broken fix * really fix default JS parser * add explanatory comment * add @SInCE comment with updated target version * version bumps
WordPress · Nov 30, 2018 · 89e9ef9 · 89e9ef9
1 parent 7d2c994
commit 89e9ef9
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 3 deletions.
diff --git a/packages/block-serialization-default-parser/parser.php b/packages/block-serialization-default-parser/parser.php
@@ -359,6 +359,7 @@ function proceed() {
 	 *
 	 * @internal
 	 * @since 3.8.0
+	 * @since 4.6.1 fixed a bug in attribute parsing which caused catastrophic backtracking on invalid block comments
 	 * @return array
 	 */
 	function next_token() {
@@ -373,13 +374,18 @@ function next_token() {
 		 * match back in PHP to see which one it was.
 		 */
 		$has_match = preg_match(
-			'/<!--\s+(?<closer>\/)?wp:(?<namespace>[a-z][a-z0-9_-]*\/)?(?<name>[a-z][a-z0-9_-]*)\s+(?<attrs>{(?:[^}]+|}+(?=})|(?!}\s+-->).)*?}\s+)?(?<void>\/)?-->/s',
+			'/<!--\s+(?<closer>\/)?wp:(?<namespace>[a-z][a-z0-9_-]*\/)?(?<name>[a-z][a-z0-9_-]*)\s+(?<attrs>{(?:(?:[^}]+|}+(?=})|(?!}\s+\/?-->).)*+)?}\s+)?(?<void>\/)?-->/s',
 			$this->document,
 			$matches,
 			PREG_OFFSET_CAPTURE,
 			$this->offset
 		);
 
+		// if we get here we probably have catastrophic backtracking or out-of-memory in the PCRE
+		if ( false === $has_match ) {
+			return array( 'no-more-tokens', null, null, null, null );
+		}
+
 		// we have no more tokens
 		if ( 0 === $has_match ) {
 			return array( 'no-more-tokens', null, null, null, null );

diff --git a/packages/block-serialization-default-parser/src/index.js b/packages/block-serialization-default-parser/src/index.js
@@ -2,7 +2,49 @@ let document;
 let offset;
 let output;
 let stack;
-const tokenizer = /<!--\s+(\/)?wp:([a-z][a-z0-9_-]*\/)?([a-z][a-z0-9_-]*)\s+({(?:[^}]+|}+(?=})|(?!}\s+-->)[^])*?}\s+)?(\/)?-->/g;
+
+/**
+ * Matches block comment delimiters
+ *
+ * While most of this pattern is straightforward the attribute parsing
+ * incorporates a tricks to make sure we don't choke on specific input
+ *
+ *  - since JavaScript has no possessive quantifier or atomic grouping
+ *    we are emulating it with a trick
+ *
+ *    we want a possessive quantifier or atomic group to prevent backtracking
+ *    on the `}`s should we fail to match the remainder of the pattern
+ *
+ *    we can emulate this with a positive lookahead and back reference
+ *    (a++)*c === ((?=(a+))\1)*c
+ *
+ *    let's examine an example:
+ *      - /(a+)*c/.test('aaaaaaaaaaaaad') fails after over 49,000 steps
+ *      - /(a++)*c/.test('aaaaaaaaaaaaad') fails after 85 steps
+ *      - /(?>a+)*c/.test('aaaaaaaaaaaaad') fails after 126 steps
+ *
+ *    this is because the possessive `++` and the atomic group `(?>)`
+ *    tell the engine that all those `a`s belong together as a single group
+ *    and so it won't split it up when stepping backwards to try and match
+ *
+ *    if we use /((?=(a+))\1)*c/ then we get the same behavior as the atomic group
+ *    or possessive and prevent the backtracking because the `a+` is matched but
+ *    not captured. thus, we find the long string of `a`s and remember it, then
+ *    reference it as a whole unit inside our pattern
+ *
+ *    @cite http://instanceof.me/post/52245507631/regex-emulate-atomic-grouping-with-lookahead
+ *    @cite http://blog.stevenlevithan.com/archives/mimic-atomic-groups
+ *    @cite https://javascript.info/regexp-infinite-backtracking-problem
+ *
+ *    once browsers reliably support atomic grouping or possessive
+ *    quantifiers natively we should remove this trick and simplify
+ *
+ * @type RegExp
+ *
+ * @since 3.8.0
+ * @since 4.6.1 added optimization to prevent backtracking on attribute parsing
+ */
+const tokenizer = /<!--\s+(\/)?wp:([a-z][a-z0-9_-]*\/)?([a-z][a-z0-9_-]*)\s+({(?:(?=([^}]+|}+(?=})|(?!}\s+\/?-->)[^])*)\5|[^]*?)}\s+)?(\/)?-->/g;
 
 function Block( blockName, attrs, innerBlocks, innerHTML, innerContent ) {
 	return {
@@ -189,7 +231,7 @@ function nextToken() {
 	}
 
 	const startedAt = matches.index;
-	const [ match, closerMatch, namespaceMatch, nameMatch, attrsMatch, voidMatch ] = matches;
+	const [ match, closerMatch, namespaceMatch, nameMatch, attrsMatch, /* internal/unused */, voidMatch ] = matches;
 
 	const length = match.length;
 	const isCloser = !! closerMatch;

diff --git a/packages/block-serialization-spec-parser/shared-tests.js b/packages/block-serialization-spec-parser/shared-tests.js
@@ -155,6 +155,15 @@ export const jsTester = ( parse ) => () => {
 			expect( () => parsed = parse( `<!-- wp:fake {"a":"${ as }"} /-->` )[ 0 ] ).not.toThrow();
 			expect( parsed.attrs.a ).toHaveLength( length );
 		} );
+
+		describe( 'invalid block comment syntax', () => {
+			test( 'extra space after void closer', () => {
+				let parsed;
+
+				expect( () => parsed = parse( '<!-- wp:block / -->' )[ 0 ] ).not.toThrow();
+				expect( parsed.blockName ).toBeNull();
+			} );
+		} );
 	} );
 };