Fix for <!DOCTYPE> tag html parsing, which could cause the regex engine to freeze with 100% cpu for certain inputs. Add fix for proper handling of <A> tags (with capitalized tag name) as well.
gregjacobs · Nov 24, 2014 · 6335974 · 6335974
1 parent 3bedddc
commit 6335974
Show file tree

Hide file tree

Showing 6 changed files with 162 additions and 52 deletions.
diff --git a/dist/Autolinker.js b/dist/Autolinker.js
@@ -16,7 +16,7 @@
 
 	/*!
 	 * Autolinker.js
-	 * 0.14.1
+	 * 0.15.0
 	 *
 	 * Copyright(c) 2014 Gregory Jacobs <[email protected]>
 	 * MIT Licensed. http://www.opensource.org/licenses/mit-license.php
@@ -850,34 +850,53 @@
 		 * 
 		 * Capturing groups:
 		 * 
-		 * 1. If it is an end tag, this group will have the '/'.
-		 * 2. The tag name.
+		 * 1. The "!DOCTYPE" tag name, if a tag is a &lt;!DOCTYPE&gt; tag.
+		 * 2. If it is an end tag, this group will have the '/'.
+		 * 3. The tag name for all tags (other than the &lt;!DOCTYPE&gt; tag)
 		 */
 		htmlRegex : (function() {
-			var tagNameRegex = /[0-9a-zA-Z:]+/,
+			var tagNameRegex = /[0-9a-zA-Z][0-9a-zA-Z:]*/,
 			    attrNameRegex = /[^\s\0"'>\/=\x01-\x1F\x7F]+/,   // the unicode range accounts for excluding control chars, and the delete char
 			    attrValueRegex = /(?:".*?"|'.*?'|[^'"=<>`\s]+)/, // double quoted, single quoted, or unquoted attribute values
 			    nameEqualsValueRegex = attrNameRegex.source + '(?:\\s*=\\s*' + attrValueRegex.source + ')?';  // optional '=[value]'
 
 			return new RegExp( [
-				'<(?:!|(/))?',  // Beginning of a tag. Either '<' for a start tag, '</' for an end tag, or <! for the <!DOCTYPE ...> tag. The slash or an empty string is Capturing Group 1.
+				// For the <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+				'(?:',
+					'<(!DOCTYPE)',  // *** Capturing Group 1 - If it's a doctype tag
+
+						// Zero or more attributes following the tag name
+						'(?:',
+							'\\s+',  // one or more whitespace chars before an attribute
+
+							// Either:
+							// A. attr="value", or 
+							// B. "value" alone (To cover example doctype tag: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">) 
+							'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
+						')*',
+					'>',
+				')',
+
+				'|',
 
-					// The tag name (Capturing Group 2)
-					'(' + tagNameRegex.source + ')',
+				// All other HTML tags (i.e. tags that are not <!DOCTYPE>)
+				'(?:',
+					'<(/)?',  // Beginning of a tag. Either '<' for a start tag, or '</' for an end tag. 
+					          // *** Capturing Group 2: The slash ('/') for an end tag; `undefined` (the group does not participate in the match) for a start or self-closing tag.
 
-					// Zero or more attributes following the tag name
-					'(?:',
-						'\\s+',  // one or more whitespace chars before an attribute
+						// *** Capturing Group 3 - The tag name
+						'(' + tagNameRegex.source + ')',
 
-						// Either:
-						// A. tag="value", or 
-						// B. "value" alone (for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">) 
-						'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
-					')*',
+						// Zero or more attributes following the tag name
+						'(?:',
+							'\\s+',                // one or more whitespace chars before an attribute
+							nameEqualsValueRegex,  // attr="value" (with optional ="value" part)
+						')*',
 
-					'\\s*/?',  // any trailing spaces and optional '/' before the closing '>'
-				'>'
-			].join( "" ), 'g' );
+						'\\s*/?',  // any trailing spaces and optional '/' before the closing '>'
+					'>',
+				')'
+			].join( "" ), 'gi' );
 		} )(),
 
 
@@ -911,15 +930,15 @@
 			// wrapping the URLs in anchor tags
 			while( ( currentResult = htmlRegex.exec( html ) ) !== null ) {
 				var tagText = currentResult[ 0 ],
-				    tagName = currentResult[ 2 ],
-				    isClosingTag = !!currentResult[ 1 ],
+				    tagName = currentResult[ 1 ] || currentResult[ 3 ],  // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a") 
+				    isClosingTag = !!currentResult[ 2 ],
 				    inBetweenTagsText = html.substring( lastIndex, currentResult.index );
 
 				if( inBetweenTagsText ) {
 					processTextNodeVisitor( inBetweenTagsText );
 				}
 
-				processHtmlNodeVisitor( tagText, tagName, isClosingTag );
+				processHtmlNodeVisitor( tagText, tagName.toLowerCase(), isClosingTag );
 
 				lastIndex = currentResult.index + tagText.length;
 			}

diff --git a/dist/Autolinker.min.js b/dist/Autolinker.min.js
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "autolinker",
-  "version": "0.14.1",
+  "version": "0.15.0",
   "description": "Utility to automatically link the URLs, email addresses, and Twitter handles in a given block of text/HTML",
   "main": "dist/Autolinker.js",
   "directories": {

diff --git a/src/HtmlParser.js b/src/HtmlParser.js
@@ -20,34 +20,53 @@ Autolinker.HtmlParser = Autolinker.Util.extend( Object, {
 	 * 
 	 * Capturing groups:
 	 * 
-	 * 1. If it is an end tag, this group will have the '/'.
-	 * 2. The tag name.
+	 * 1. The "!DOCTYPE" tag name, if a tag is a &lt;!DOCTYPE&gt; tag.
+	 * 2. If it is an end tag, this group will have the '/'.
+	 * 3. The tag name for all tags (other than the &lt;!DOCTYPE&gt; tag)
 	 */
 	htmlRegex : (function() {
-		var tagNameRegex = /[0-9a-zA-Z:]+/,
+		var tagNameRegex = /[0-9a-zA-Z][0-9a-zA-Z:]*/,
 		    attrNameRegex = /[^\s\0"'>\/=\x01-\x1F\x7F]+/,   // the unicode range accounts for excluding control chars, and the delete char
 		    attrValueRegex = /(?:".*?"|'.*?'|[^'"=<>`\s]+)/, // double quoted, single quoted, or unquoted attribute values
 		    nameEqualsValueRegex = attrNameRegex.source + '(?:\\s*=\\s*' + attrValueRegex.source + ')?';  // optional '=[value]'
 
 		return new RegExp( [
-			'<(?:!|(/))?',  // Beginning of a tag. Either '<' for a start tag, '</' for an end tag, or <! for the <!DOCTYPE ...> tag. The slash or an empty string is Capturing Group 1.
+			// For the <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+			'(?:',
+				'<(!DOCTYPE)',  // *** Capturing Group 1 - If it's a doctype tag
+
+					// Zero or more attributes following the tag name
+					'(?:',
+						'\\s+',  // one or more whitespace chars before an attribute
+
+						// Either:
+						// A. attr="value", or 
+						// B. "value" alone (To cover example doctype tag: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">) 
+						'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
+					')*',
+				'>',
+			')',
+
+			'|',
 
-				// The tag name (Capturing Group 2)
-				'(' + tagNameRegex.source + ')',
-
-				// Zero or more attributes following the tag name
-				'(?:',
-					'\\s+',  // one or more whitespace chars before an attribute
+			// All other HTML tags (i.e. tags that are not <!DOCTYPE>)
+			'(?:',
+				'<(/)?',  // Beginning of a tag. Either '<' for a start tag, or '</' for an end tag. 
+				          // *** Capturing Group 2: The slash ('/') for an end tag; `undefined` (the group does not participate in the match) for a start or self-closing tag.
+
+					// *** Capturing Group 3 - The tag name
+					'(' + tagNameRegex.source + ')',
+
+					// Zero or more attributes following the tag name
+					'(?:',
+						'\\s+',                // one or more whitespace chars before an attribute
+						nameEqualsValueRegex,  // attr="value" (with optional ="value" part)
+					')*',
 
-					// Either:
-					// A. tag="value", or 
-					// B. "value" alone (for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">) 
-					'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
-				')*',
-
-				'\\s*/?',  // any trailing spaces and optional '/' before the closing '>'
-			'>'
-		].join( "" ), 'g' );
+					'\\s*/?',  // any trailing spaces and optional '/' before the closing '>'
+				'>',
+			')'
+		].join( "" ), 'gi' );
 	} )(),
 
 
@@ -81,15 +100,15 @@ Autolinker.HtmlParser = Autolinker.Util.extend( Object, {
 		// wrapping the URLs in anchor tags
 		while( ( currentResult = htmlRegex.exec( html ) ) !== null ) {
 			var tagText = currentResult[ 0 ],
-			    tagName = currentResult[ 2 ],
-			    isClosingTag = !!currentResult[ 1 ],
+			    tagName = currentResult[ 1 ] || currentResult[ 3 ],  // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a") 
+			    isClosingTag = !!currentResult[ 2 ],
 			    inBetweenTagsText = html.substring( lastIndex, currentResult.index );
 
 			if( inBetweenTagsText ) {
 				processTextNodeVisitor( inBetweenTagsText );
 			}
 
-			processHtmlNodeVisitor( tagText, tagName, isClosingTag );
+			processHtmlNodeVisitor( tagText, tagName.toLowerCase(), isClosingTag );
 
 			lastIndex = currentResult.index + tagText.length;
 		}

diff --git a/tests/AutolinkerSpec.js b/tests/AutolinkerSpec.js
@@ -850,6 +850,14 @@ describe( "Autolinker", function() {
 			} );
 
 
+			it( "should autolink the link, and not fail with 100% cpu in the Regex engine when presented with the input in issue #54", function() {
+				var inputStr = "Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: http://dorcoshai.de/pb1205ro, und dann machst Du am Gewinnspiel mit! Gefallt mir klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)",
+				    result = autolinker.link( inputStr );
+
+				expect( result ).toBe( 'Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: <a href="http://dorcoshai.de/pb1205ro">dorcoshai.de/pb1205ro</a>, und dann machst Du am Gewinnspiel mit! Gefallt mir klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)' );
+			} );
+
+
 			it( "should NOT modify the email address with other tags when inside another anchor", function() {
 				var input = [
 					'<div>First name: Subin</div>',

diff --git a/tests/HtmlParserSpec.js b/tests/HtmlParserSpec.js
@@ -1,14 +1,18 @@
 /*global Autolinker, _, describe, beforeEach, afterEach, it, expect */
 describe( "Autolinker.HtmlParser", function() {
-	var HtmlParser = Autolinker.HtmlParser;
+	var HtmlParser = Autolinker.HtmlParser,
+	    htmlParser;
+
+
+	beforeEach( function() {
+		htmlParser = new HtmlParser();
+	} );
 
 
 	it( "should be able to reproduce the input string based on storing the results of the visitor function calls", function() {
-		var htmlParser = new HtmlParser(),
-		    inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
+		var inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
 		    result = [];
 
-
 		htmlParser.parse( inputStr, {
 			processHtmlNode : function( tagText, tagName, isClosingTag ) {
 				result.push( tagText );
@@ -23,8 +27,7 @@ describe( "Autolinker.HtmlParser", function() {
 
 
 	it( "should properly call the visitor functions for each text / html node encountered, with the proper arguments", function() {
-		var htmlParser = new HtmlParser(),
-		    inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
+		var inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
 		    htmlNodeArgs = [],
 		    textNodeArgs = [];
 
@@ -51,4 +54,65 @@ describe( "Autolinker.HtmlParser", function() {
 		expect( textNodeArgs[ 4 ] ).toEqual( [ ' items' ] );
 	} );
 
+
+	it( 'should match tags of both upper and lower case', function() {
+		var inputStr = 'Joe <!DOCTYPE html> went <!doctype "blah" "blah blah"> to <a href="google.com">ebay.com</a> today, and <A href="purchase.com">purchased</A> <b>big</b> <B>items</B>',
+		    htmlNodeArgs = [],
+		    textNodeArgs = [];
+
+		htmlParser.parse( inputStr, {
+			processHtmlNode : function( tagText, tagName, isClosingTag ) {
+				htmlNodeArgs.push( Array.prototype.slice.call( arguments ) );
+			},
+			processTextNode : function( text ) {
+				textNodeArgs.push( Array.prototype.slice.call( arguments ) );
+			}
+		} );
+
+		expect( htmlNodeArgs.length ).toBe( 10 );
+		expect( htmlNodeArgs[ 0 ] ).toEqual( [ '<!DOCTYPE html>', '!doctype', false ] );
+		expect( htmlNodeArgs[ 1 ] ).toEqual( [ '<!doctype "blah" "blah blah">', '!doctype', false ] );
+		expect( htmlNodeArgs[ 2 ] ).toEqual( [ '<a href="google.com">', 'a', false ] );
+		expect( htmlNodeArgs[ 3 ] ).toEqual( [ '</a>', 'a', true ] );
+		expect( htmlNodeArgs[ 4 ] ).toEqual( [ '<A href="purchase.com">', 'a', false ] );
+		expect( htmlNodeArgs[ 5 ] ).toEqual( [ '</A>', 'a', true ] );
+		expect( htmlNodeArgs[ 6 ] ).toEqual( [ '<b>', 'b', false ] );
+		expect( htmlNodeArgs[ 7 ] ).toEqual( [ '</b>', 'b', true ] );
+		expect( htmlNodeArgs[ 8 ] ).toEqual( [ '<B>', 'b', false ] );
+		expect( htmlNodeArgs[ 9 ] ).toEqual( [ '</B>', 'b', true ] );
+
+		expect( textNodeArgs.length ).toBe( 10 );
+		expect( textNodeArgs[ 0 ] ).toEqual( [ 'Joe ' ] );
+		expect( textNodeArgs[ 1 ] ).toEqual( [ ' went ' ] );
+		expect( textNodeArgs[ 2 ] ).toEqual( [ ' to ' ] );
+		expect( textNodeArgs[ 3 ] ).toEqual( [ 'ebay.com' ] );
+		expect( textNodeArgs[ 4 ] ).toEqual( [ ' today, and ' ] );
+		expect( textNodeArgs[ 5 ] ).toEqual( [ 'purchased' ] );
+		expect( textNodeArgs[ 6 ] ).toEqual( [ ' ' ] );
+		expect( textNodeArgs[ 7 ] ).toEqual( [ 'big' ] );
+		expect( textNodeArgs[ 8 ] ).toEqual( [ ' ' ] );
+		expect( textNodeArgs[ 9 ] ).toEqual( [ 'items' ] );
+	} );
+
+
+	it( "should not freeze up the regular expression engine when presented with the input string in issue #54", function() {
+		var inputStr = "Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: http://dorcoshai.de/pb1205ro, und dann machst Du am Gewinnspiel mit! 'Gefallt mir' klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)",
+		    htmlNodeArgs = [],
+		    textNodeArgs = [];
+
+		htmlParser.parse( inputStr, {
+			processHtmlNode : function( tagText, tagName, isClosingTag ) {
+				htmlNodeArgs.push( Array.prototype.slice.call( arguments ) );
+			},
+			processTextNode : function( text ) {
+				textNodeArgs.push( Array.prototype.slice.call( arguments ) );
+			}
+		} );
+
+		expect( htmlNodeArgs.length ).toBe( 0 );
+
+		expect( textNodeArgs.length ).toBe( 1 );
+		expect( textNodeArgs[ 0 ] ).toEqual( [ inputStr ] );
+	} );
+
 } );