Skip to content

Commit

Permalink
Fix for <!DOCTYPE> tag html parsing, which could cause the regex engine to freeze with 100% cpu for certain inputs.
Browse files Browse the repository at this point in the history

Add fix for proper handling of <A> tags (with capitalized tag name) as well.
  • Loading branch information
gregjacobs committed Nov 24, 2014
1 parent 3bedddc commit 6335974
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 52 deletions.
61 changes: 40 additions & 21 deletions dist/Autolinker.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

/*!
* Autolinker.js
* 0.14.1
* 0.15.0
*
* Copyright(c) 2014 Gregory Jacobs <[email protected]>
* MIT Licensed. http://www.opensource.org/licenses/mit-license.php
Expand Down Expand Up @@ -850,34 +850,53 @@
*
* Capturing groups:
*
* 1. If it is an end tag, this group will have the '/'.
* 2. The tag name.
* 1. The "!DOCTYPE" tag name, if a tag is a &lt;!DOCTYPE&gt; tag.
* 2. If it is an end tag, this group will have the '/'.
* 3. The tag name for all tags (other than the &lt;!DOCTYPE&gt; tag)
*/
htmlRegex : (function() {
var tagNameRegex = /[0-9a-zA-Z:]+/,
var tagNameRegex = /[0-9a-zA-Z][0-9a-zA-Z:]*/,
attrNameRegex = /[^\s\0"'>\/=\x01-\x1F\x7F]+/, // the unicode range accounts for excluding control chars, and the delete char
attrValueRegex = /(?:".*?"|'.*?'|[^'"=<>`\s]+)/, // double quoted, single quoted, or unquoted attribute values
nameEqualsValueRegex = attrNameRegex.source + '(?:\\s*=\\s*' + attrValueRegex.source + ')?'; // optional '=[value]'

return new RegExp( [
'<(?:!|(/))?', // Beginning of a tag. Either '<' for a start tag, '</' for an end tag, or <! for the <!DOCTYPE ...> tag. The slash or an empty string is Capturing Group 1.
// for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
'(?:',
'<(!DOCTYPE)', // *** Capturing Group 1 - If it's a doctype tag

// Zero or more attributes following the tag name
'(?:',
'\\s+', // one or more whitespace chars before an attribute

// Either:
// A. attr="value", or
// B. "value" alone (To cover example doctype tag: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
')*',
'>',
')',

'|',

// The tag name (Capturing Group 2)
'(' + tagNameRegex.source + ')',
// All other HTML tags (i.e. tags that are not <!DOCTYPE>)
'(?:',
'<(/)?', // Beginning of a tag. Either '<' for a start tag, or '</' for an end tag.
// *** Capturing Group 2: The slash or an empty string. Slash ('/') for end tag, empty string for start or self-closing tag.

// Zero or more attributes following the tag name
'(?:',
'\\s+', // one or more whitespace chars before an attribute
// *** Capturing Group 3 - The tag name
'(' + tagNameRegex.source + ')',

// Either:
// A. tag="value", or
// B. "value" alone (for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
')*',
// Zero or more attributes following the tag name
'(?:',
'\\s+', // one or more whitespace chars before an attribute
nameEqualsValueRegex, // attr="value" (with optional ="value" part)
')*',

'\\s*/?', // any trailing spaces and optional '/' before the closing '>'
'>'
].join( "" ), 'g' );
'\\s*/?', // any trailing spaces and optional '/' before the closing '>'
'>',
')'
].join( "" ), 'gi' );
} )(),


Expand Down Expand Up @@ -911,15 +930,15 @@
// wrapping the URLs in anchor tags
while( ( currentResult = htmlRegex.exec( html ) ) !== null ) {
var tagText = currentResult[ 0 ],
tagName = currentResult[ 2 ],
isClosingTag = !!currentResult[ 1 ],
tagName = currentResult[ 1 ] || currentResult[ 3 ], // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a")
isClosingTag = !!currentResult[ 2 ],
inBetweenTagsText = html.substring( lastIndex, currentResult.index );

if( inBetweenTagsText ) {
processTextNodeVisitor( inBetweenTagsText );
}

processHtmlNodeVisitor( tagText, tagName, isClosingTag );
processHtmlNodeVisitor( tagText, tagName.toLowerCase(), isClosingTag );

lastIndex = currentResult.index + tagText.length;
}
Expand Down
4 changes: 2 additions & 2 deletions dist/Autolinker.min.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "autolinker",
"version": "0.14.1",
"version": "0.15.0",
"description": "Utility to automatically link the URLs, email addresses, and Twitter handles in a given block of text/HTML",
"main": "dist/Autolinker.js",
"directories": {
Expand Down
63 changes: 41 additions & 22 deletions src/HtmlParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,34 +20,53 @@ Autolinker.HtmlParser = Autolinker.Util.extend( Object, {
*
* Capturing groups:
*
* 1. If it is an end tag, this group will have the '/'.
* 2. The tag name.
* 1. The "!DOCTYPE" tag name, if a tag is a &lt;!DOCTYPE&gt; tag.
* 2. If it is an end tag, this group will have the '/'.
* 3. The tag name for all tags (other than the &lt;!DOCTYPE&gt; tag)
*/
htmlRegex : (function() {
var tagNameRegex = /[0-9a-zA-Z:]+/,
var tagNameRegex = /[0-9a-zA-Z][0-9a-zA-Z:]*/,
attrNameRegex = /[^\s\0"'>\/=\x01-\x1F\x7F]+/, // the unicode range accounts for excluding control chars, and the delete char
attrValueRegex = /(?:".*?"|'.*?'|[^'"=<>`\s]+)/, // double quoted, single quoted, or unquoted attribute values
nameEqualsValueRegex = attrNameRegex.source + '(?:\\s*=\\s*' + attrValueRegex.source + ')?'; // optional '=[value]'

return new RegExp( [
'<(?:!|(/))?', // Beginning of a tag. Either '<' for a start tag, '</' for an end tag, or <! for the <!DOCTYPE ...> tag. The slash or an empty string is Capturing Group 1.
// for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
'(?:',
'<(!DOCTYPE)', // *** Capturing Group 1 - If it's a doctype tag

// Zero or more attributes following the tag name
'(?:',
'\\s+', // one or more whitespace chars before an attribute

// Either:
// A. attr="value", or
// B. "value" alone (To cover example doctype tag: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
')*',
'>',
')',

'|',

// The tag name (Capturing Group 2)
'(' + tagNameRegex.source + ')',

// Zero or more attributes following the tag name
'(?:',
'\\s+', // one or more whitespace chars before an attribute
// All other HTML tags (i.e. tags that are not <!DOCTYPE>)
'(?:',
'<(/)?', // Beginning of a tag. Either '<' for a start tag, or '</' for an end tag.
// *** Capturing Group 2: The slash or an empty string. Slash ('/') for end tag, empty string for start or self-closing tag.

// *** Capturing Group 3 - The tag name
'(' + tagNameRegex.source + ')',

// Zero or more attributes following the tag name
'(?:',
'\\s+', // one or more whitespace chars before an attribute
nameEqualsValueRegex, // attr="value" (with optional ="value" part)
')*',

// Either:
// A. tag="value", or
// B. "value" alone (for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
')*',

'\\s*/?', // any trailing spaces and optional '/' before the closing '>'
'>'
].join( "" ), 'g' );
'\\s*/?', // any trailing spaces and optional '/' before the closing '>'
'>',
')'
].join( "" ), 'gi' );
} )(),


Expand Down Expand Up @@ -81,15 +100,15 @@ Autolinker.HtmlParser = Autolinker.Util.extend( Object, {
// wrapping the URLs in anchor tags
while( ( currentResult = htmlRegex.exec( html ) ) !== null ) {
var tagText = currentResult[ 0 ],
tagName = currentResult[ 2 ],
isClosingTag = !!currentResult[ 1 ],
tagName = currentResult[ 1 ] || currentResult[ 3 ], // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a")
isClosingTag = !!currentResult[ 2 ],
inBetweenTagsText = html.substring( lastIndex, currentResult.index );

if( inBetweenTagsText ) {
processTextNodeVisitor( inBetweenTagsText );
}

processHtmlNodeVisitor( tagText, tagName, isClosingTag );
processHtmlNodeVisitor( tagText, tagName.toLowerCase(), isClosingTag );

lastIndex = currentResult.index + tagText.length;
}
Expand Down
8 changes: 8 additions & 0 deletions tests/AutolinkerSpec.js
Original file line number Diff line number Diff line change
Expand Up @@ -850,6 +850,14 @@ describe( "Autolinker", function() {
} );


it( "should autolink the link, and not fail with 100% cpu in the Regex engine when presented with the input in issue #54", function() {
	// Input reported in issue #54. It contains a stray "<3" and a single URL;
	// only the URL is expected to end up wrapped in an anchor tag.
	var issue54Text = "Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: http://dorcoshai.de/pb1205ro, und dann machst Du am Gewinnspiel mit! Gefallt mir klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)";

	var linkedText = autolinker.link( issue54Text );

	expect( linkedText ).toBe( 'Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: <a href="http://dorcoshai.de/pb1205ro">dorcoshai.de/pb1205ro</a>, und dann machst Du am Gewinnspiel mit! Gefallt mir klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)' );
} );


it( "should NOT modify the email address with other tags when inside another anchor", function() {
var input = [
'<div>First name: Subin</div>',
Expand Down
76 changes: 70 additions & 6 deletions tests/HtmlParserSpec.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
/*global Autolinker, _, describe, beforeEach, afterEach, it, expect */
describe( "Autolinker.HtmlParser", function() {
var HtmlParser = Autolinker.HtmlParser;
var HtmlParser = Autolinker.HtmlParser,
htmlParser;


beforeEach( function() {
	// Build a new parser instance and hand it to the specs through the shared `htmlParser` variable
	var freshParser = new HtmlParser();

	htmlParser = freshParser;
} );


it( "should be able to reproduce the input string based on storing the results of the visitor function calls", function() {
var htmlParser = new HtmlParser(),
inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
var inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
result = [];


htmlParser.parse( inputStr, {
processHtmlNode : function( tagText, tagName, isClosingTag ) {
result.push( tagText );
Expand All @@ -23,8 +27,7 @@ describe( "Autolinker.HtmlParser", function() {


it( "should properly call the visitor functions for each text / html node encountered, with the proper arguments", function() {
var htmlParser = new HtmlParser(),
inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
var inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
htmlNodeArgs = [],
textNodeArgs = [];

Expand All @@ -51,4 +54,65 @@ describe( "Autolinker.HtmlParser", function() {
expect( textNodeArgs[ 4 ] ).toEqual( [ ' items' ] );
} );


it( 'should match tags of both upper and lower case', function() {
	var source = 'Joe <!DOCTYPE html> went <!doctype "blah" "blah blah"> to <a href="google.com">ebay.com</a> today, and <A href="purchase.com">purchased</A> <b>big</b> <B>items</B>';
	var seenTags = [];
	var seenText = [];
	var i;

	// Expected [ tagText, tagName, isClosingTag ] triples, in document order.
	// The tag name is expected in lower case no matter how it is written in the input.
	var expectedTags = [
		[ '<!DOCTYPE html>', '!doctype', false ],
		[ '<!doctype "blah" "blah blah">', '!doctype', false ],
		[ '<a href="google.com">', 'a', false ],
		[ '</a>', 'a', true ],
		[ '<A href="purchase.com">', 'a', false ],
		[ '</A>', 'a', true ],
		[ '<b>', 'b', false ],
		[ '</b>', 'b', true ],
		[ '<B>', 'b', false ],
		[ '</B>', 'b', true ]
	];

	// Expected text found between the tags, in document order.
	var expectedText = [
		'Joe ',
		' went ',
		' to ',
		'ebay.com',
		' today, and ',
		'purchased',
		' ',
		'big',
		' ',
		'items'
	];

	// Record the full argument list of every visitor call
	var visitors = {
		processHtmlNode : function( tagText, tagName, isClosingTag ) {
			seenTags.push( Array.prototype.slice.call( arguments ) );
		},
		processTextNode : function( text ) {
			seenText.push( Array.prototype.slice.call( arguments ) );
		}
	};

	htmlParser.parse( source, visitors );

	expect( seenTags.length ).toBe( 10 );
	for( i = 0; i < expectedTags.length; i++ ) {
		expect( seenTags[ i ] ).toEqual( expectedTags[ i ] );
	}

	expect( seenText.length ).toBe( 10 );
	for( i = 0; i < expectedText.length; i++ ) {
		expect( seenText[ i ] ).toEqual( [ expectedText[ i ] ] );
	}
} );


it( "should not freeze up the regular expression engine when presented with the input string in issue #54", function() {
	// Input reported in issue #54. It contains a "<3" that is never closed by a '>',
	// so the parser is expected to report no tags and hand back the whole string as one text node.
	var issue54Text = "Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: http://dorcoshai.de/pb1205ro, und dann machst Du am Gewinnspiel mit! 'Gefallt mir' klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)";
	var collectedTags = [];
	var collectedText = [];

	// Record the full argument list of every visitor call
	var visitors = {
		processHtmlNode : function( tagText, tagName, isClosingTag ) {
			collectedTags.push( Array.prototype.slice.call( arguments ) );
		},
		processTextNode : function( text ) {
			collectedText.push( Array.prototype.slice.call( arguments ) );
		}
	};

	htmlParser.parse( issue54Text, visitors );

	expect( collectedTags.length ).toBe( 0 );

	expect( collectedText.length ).toBe( 1 );
	expect( collectedText[ 0 ] ).toEqual( [ issue54Text ] );
} );

} );

0 comments on commit 6335974

Please sign in to comment.