Skip to content

Commit

Permalink
Match URI scheme names that are greater than 9 characters, and allow …
Browse files Browse the repository at this point in the history
…for digits, +, ., and - in the scheme name. (Ex: 'chrome-extension:')
  • Loading branch information
gregjacobs committed Nov 17, 2014
1 parent 55e4122 commit 2479a45
Show file tree
Hide file tree
Showing 7 changed files with 274 additions and 104 deletions.
61 changes: 39 additions & 22 deletions dist/Autolinker.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

/*!
* Autolinker.js
* 0.13.1
* 0.14.0
*
* Copyright(c) 2014 Gregory Jacobs <[email protected]>
* MIT Licensed. http://www.opensource.org/licenses/mit-license.php
Expand Down Expand Up @@ -243,7 +243,7 @@

emailRegex = /(?:[\-;:&=\+\$,\w\.]+@)/, // something@ for email addresses (a.k.a. local-part)

protocolRegex = /(?:[A-Za-z]{3,9}:(?![A-Za-z]{3,9}:\/\/)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:")
protocolRegex = /(?:[A-Za-z][-.+A-Za-z0-9]+:(?![A-Za-z][-.+A-Za-z0-9]+:\/\/)(?!\d+\/?)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
wwwRegex = /(?:www\.)/, // starting with 'www.'
domainNameRegex = /[A-Za-z0-9\.\-]*[A-Za-z0-9\-]/, // anything looking at all like a domain, non-unicode domains, not ending in a period
tldRegex = /\.(?:international|construction|contractors|enterprises|photography|productions|foundation|immobilien|industries|management|properties|technology|christmas|community|directory|education|equipment|institute|marketing|solutions|vacations|bargains|boutique|builders|catering|cleaning|clothing|computer|democrat|diamonds|graphics|holdings|lighting|partners|plumbing|supplies|training|ventures|academy|careers|company|cruises|domains|exposed|flights|florist|gallery|guitars|holiday|kitchen|neustar|okinawa|recipes|rentals|reviews|shiksha|singles|support|systems|agency|berlin|camera|center|coffee|condos|dating|estate|events|expert|futbol|kaufen|luxury|maison|monash|museum|nagoya|photos|repair|report|social|supply|tattoo|tienda|travel|viajes|villas|vision|voting|voyage|actor|build|cards|cheap|codes|dance|email|glass|house|mango|ninja|parts|photo|shoes|solar|today|tokyo|tools|watch|works|aero|arpa|asia|best|bike|blue|buzz|camp|club|cool|coop|farm|fish|gift|guru|info|jobs|kiwi|kred|land|limo|link|menu|mobi|moda|name|pics|pink|post|qpon|rich|ruhr|sexy|tips|vote|voto|wang|wien|wiki|zone|bar|bid|biz|cab|cat|ceo|com|edu|gov|int|kim|mil|net|onl|org|pro|pub|red|tel|uno|wed|xxx|xyz|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)\b/, // match our known top level domains (TLDs)
Expand Down Expand Up @@ -563,7 +563,8 @@
match = new Autolinker.match.Url( {
matchedText : matchStr,
url : matchStr,
protocolRelativeMatch : protocolRelativeMatch,
protocolUrlMatch : !!protocolUrlMatch,
protocolRelativeMatch : !!protocolRelativeMatch,
stripPrefix : this.stripPrefix
} );
}
Expand Down Expand Up @@ -1318,23 +1319,23 @@
* @private
* @property {RegExp} hasFullProtocolRegex
*/
hasFullProtocolRegex : /^[A-Za-z]{3,9}:\/\//,
hasFullProtocolRegex : /^[A-Za-z][-.+A-Za-z0-9]+:\/\//,

/**
* Regex to test for a protocol prefix, such as 'mailto:'
*
* @private
* @property {RegExp} hasProtocolPrefixRegex
*/
hasProtocolPrefixRegex : /^[A-Za-z]{3,9}:/,
hasProtocolPrefixRegex : /^[A-Za-z][-.+A-Za-z0-9]+:/,

/**
* Regex to determine if at least one letter (A-Z or a-z) exists after the protocol (i.e. after the ':')
*
* @private
* @property {RegExp} hasWordCharAfterProtocolRegex
*/
hasWordCharAfterProtocolRegex : /:.*?[A-Za-z]/,
hasWordCharAfterProtocolRegex : /:[^\s]*?[A-Za-z]/,


/**
Expand All @@ -1360,9 +1361,9 @@
*/
isValidMatch : function( urlMatch, protocolUrlMatch, protocolRelativeMatch ) {
if(
this.urlMatchDoesNotHaveProtocolOrDot( urlMatch, protocolUrlMatch ) || // At least one period ('.') must exist in the URL match for us to consider it an actual URL, *unless* it was a full protocol match (like 'http://localhost')
this.urlMatchDoesNotHaveAtLeastOneWordChar( urlMatch ) || // At least one letter character must exist in the domain name after a protocol match. Ex: skip over something like "git:1.0"
this.isInvalidProtocolRelativeMatch( protocolRelativeMatch ) // A protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
this.urlMatchDoesNotHaveProtocolOrDot( urlMatch, protocolUrlMatch ) || // At least one period ('.') must exist in the URL match for us to consider it an actual URL, *unless* it was a full protocol match (like 'http://localhost')
this.urlMatchDoesNotHaveAtLeastOneWordChar( urlMatch, protocolUrlMatch ) || // At least one letter character must exist in the domain name after a protocol match. Ex: skip over something like "git:1.0"
this.isInvalidProtocolRelativeMatch( protocolRelativeMatch ) // A protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
) {
return false;
}
Expand All @@ -1388,7 +1389,7 @@
* match.
*/
urlMatchDoesNotHaveProtocolOrDot : function( urlMatch, protocolUrlMatch ) {
return ( urlMatch && ( !protocolUrlMatch || !this.hasFullProtocolRegex.test( protocolUrlMatch ) ) && urlMatch.indexOf( '.' ) === -1 );
return ( !!urlMatch && ( !protocolUrlMatch || !this.hasFullProtocolRegex.test( protocolUrlMatch ) ) && urlMatch.indexOf( '.' ) === -1 );
},


Expand All @@ -1400,11 +1401,18 @@
*
* @private
* @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
* @param {String} protocolUrlMatch The match URL string for a protocol match. Ex: 'http://yahoo.com'. This is used to
* know whether or not we have a protocol in the URL string, in order to check for a word character after the protocol
* separator (':').
* @return {Boolean} `true` if the URL match does not have at least one word character in it after the protocol, `false`
* otherwise.
*/
urlMatchDoesNotHaveAtLeastOneWordChar : function( urlMatch ) {
return ( urlMatch && this.hasProtocolPrefixRegex.test( urlMatch ) && !this.hasWordCharAfterProtocolRegex.test( urlMatch ) );
urlMatchDoesNotHaveAtLeastOneWordChar : function( urlMatch, protocolUrlMatch ) {
if( urlMatch && protocolUrlMatch ) {
return !this.hasWordCharAfterProtocolRegex.test( urlMatch );
} else {
return false;
}
},


Expand All @@ -1420,7 +1428,7 @@
* @return {Boolean} `true` if it is an invalid protocol-relative match, `false` otherwise.
*/
isInvalidProtocolRelativeMatch : function( protocolRelativeMatch ) {
return ( protocolRelativeMatch && this.invalidProtocolRelMatchRegex.test( protocolRelativeMatch ) );
return ( !!protocolRelativeMatch && this.invalidProtocolRelMatchRegex.test( protocolRelativeMatch ) );
}

} );
Expand Down Expand Up @@ -1785,6 +1793,13 @@
* The url that was matched.
*/

/**
* @cfg {Boolean} protocolUrlMatch (required)
*
* `true` if the URL is a match which already has a protocol (e.g. 'http://'), `false` if the match was from a 'www' or
* known TLD match.
*/

/**
* @cfg {Boolean} protocolRelativeMatch (required)
*
Expand Down Expand Up @@ -1816,13 +1831,13 @@
protocolRelativeRegex : /^\/\//,

/**
* @protected
* @property {RegExp} checkForProtocolRegex
* @private
* @property {Boolean} protocolPrepended
*
* A regular expression used to check if the {@link #url} is missing a protocol (in which case, 'http://'
* will be added).
* Will be set to `true` if the 'http://' protocol has been prepended to the {@link #url} (because the
* {@link #url} did not have a protocol)
*/
checkForProtocolRegex: /^[A-Za-z]{3,9}:/,
protocolPrepended : false,


/**
Expand All @@ -1836,17 +1851,19 @@


/**
* Returns the url that was matched, assuming the protocol to be 'http://' if the match
* was missing a protocol.
* Returns the url that was matched, assuming the protocol to be 'http://' if the original
* match was missing a protocol.
*
* @return {String}
*/
getUrl : function() {
var url = this.url;

// if the url string doesn't begin with a protocol, assume http://
if( !this.protocolRelativeMatch && !this.checkForProtocolRegex.test( url ) ) {
// if the url string doesn't begin with a protocol, assume 'http://'
if( !this.protocolRelativeMatch && !this.protocolUrlMatch && !this.protocolPrepended ) {
url = this.url = 'http://' + url;

this.protocolPrepended = true;
}

return url;
Expand Down
4 changes: 2 additions & 2 deletions dist/Autolinker.min.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "autolinker",
"version": "0.13.1",
"version": "0.14.0",
"description": "Utility to automatically link the URLs, email addresses, and Twitter handles in a given block of text/HTML",
"main": "dist/Autolinker.js",
"directories": {
Expand Down
5 changes: 3 additions & 2 deletions src/Autolinker.js
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ Autolinker.prototype = {

emailRegex = /(?:[\-;:&=\+\$,\w\.]+@)/, // something@ for email addresses (a.k.a. local-part)

protocolRegex = /(?:[A-Za-z]{3,9}:(?![A-Za-z]{3,9}:\/\/)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:")
protocolRegex = /(?:[A-Za-z][-.+A-Za-z0-9]+:(?![A-Za-z][-.+A-Za-z0-9]+:\/\/)(?!\d+\/?)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
wwwRegex = /(?:www\.)/, // starting with 'www.'
domainNameRegex = /[A-Za-z0-9\.\-]*[A-Za-z0-9\-]/, // anything looking at all like a domain, non-unicode domains, not ending in a period
tldRegex = /\.(?:international|construction|contractors|enterprises|photography|productions|foundation|immobilien|industries|management|properties|technology|christmas|community|directory|education|equipment|institute|marketing|solutions|vacations|bargains|boutique|builders|catering|cleaning|clothing|computer|democrat|diamonds|graphics|holdings|lighting|partners|plumbing|supplies|training|ventures|academy|careers|company|cruises|domains|exposed|flights|florist|gallery|guitars|holiday|kitchen|neustar|okinawa|recipes|rentals|reviews|shiksha|singles|support|systems|agency|berlin|camera|center|coffee|condos|dating|estate|events|expert|futbol|kaufen|luxury|maison|monash|museum|nagoya|photos|repair|report|social|supply|tattoo|tienda|travel|viajes|villas|vision|voting|voyage|actor|build|cards|cheap|codes|dance|email|glass|house|mango|ninja|parts|photo|shoes|solar|today|tokyo|tools|watch|works|aero|arpa|asia|best|bike|blue|buzz|camp|club|cool|coop|farm|fish|gift|guru|info|jobs|kiwi|kred|land|limo|link|menu|mobi|moda|name|pics|pink|post|qpon|rich|ruhr|sexy|tips|vote|voto|wang|wien|wiki|zone|bar|bid|biz|cab|cat|ceo|com|edu|gov|int|kim|mil|net|onl|org|pro|pub|red|tel|uno|wed|xxx|xyz|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)\b/, // match our known top level domains (TLDs)
Expand Down Expand Up @@ -538,7 +538,8 @@ Autolinker.prototype = {
match = new Autolinker.match.Url( {
matchedText : matchStr,
url : matchStr,
protocolRelativeMatch : protocolRelativeMatch,
protocolUrlMatch : !!protocolUrlMatch,
protocolRelativeMatch : !!protocolRelativeMatch,
stripPrefix : this.stripPrefix
} );
}
Expand Down
27 changes: 17 additions & 10 deletions src/MatchValidator.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,23 @@ Autolinker.MatchValidator = Autolinker.Util.extend( Object, {
* @private
* @property {RegExp} hasFullProtocolRegex
*/
hasFullProtocolRegex : /^[A-Za-z]{3,9}:\/\//,
hasFullProtocolRegex : /^[A-Za-z][-.+A-Za-z0-9]+:\/\//,

/**
* Regex to test for a protocol prefix, such as 'mailto:'
*
* @private
* @property {RegExp} hasProtocolPrefixRegex
*/
hasProtocolPrefixRegex : /^[A-Za-z]{3,9}:/,
hasProtocolPrefixRegex : /^[A-Za-z][-.+A-Za-z0-9]+:/,

/**
* Regex to determine if at least one letter (A-Z or a-z) exists after the protocol (i.e. after the ':')
*
* @private
* @property {RegExp} hasWordCharAfterProtocolRegex
*/
hasWordCharAfterProtocolRegex : /:.*?[A-Za-z]/,
hasWordCharAfterProtocolRegex : /:[^\s]*?[A-Za-z]/,


/**
Expand All @@ -76,9 +76,9 @@ Autolinker.MatchValidator = Autolinker.Util.extend( Object, {
*/
isValidMatch : function( urlMatch, protocolUrlMatch, protocolRelativeMatch ) {
if(
this.urlMatchDoesNotHaveProtocolOrDot( urlMatch, protocolUrlMatch ) || // At least one period ('.') must exist in the URL match for us to consider it an actual URL, *unless* it was a full protocol match (like 'http://localhost')
this.urlMatchDoesNotHaveAtLeastOneWordChar( urlMatch ) || // At least one letter character must exist in the domain name after a protocol match. Ex: skip over something like "git:1.0"
this.isInvalidProtocolRelativeMatch( protocolRelativeMatch ) // A protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
this.urlMatchDoesNotHaveProtocolOrDot( urlMatch, protocolUrlMatch ) || // At least one period ('.') must exist in the URL match for us to consider it an actual URL, *unless* it was a full protocol match (like 'http://localhost')
this.urlMatchDoesNotHaveAtLeastOneWordChar( urlMatch, protocolUrlMatch ) || // At least one letter character must exist in the domain name after a protocol match. Ex: skip over something like "git:1.0"
this.isInvalidProtocolRelativeMatch( protocolRelativeMatch ) // A protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
) {
return false;
}
Expand All @@ -104,7 +104,7 @@ Autolinker.MatchValidator = Autolinker.Util.extend( Object, {
* match.
*/
urlMatchDoesNotHaveProtocolOrDot : function( urlMatch, protocolUrlMatch ) {
return ( urlMatch && ( !protocolUrlMatch || !this.hasFullProtocolRegex.test( protocolUrlMatch ) ) && urlMatch.indexOf( '.' ) === -1 );
return ( !!urlMatch && ( !protocolUrlMatch || !this.hasFullProtocolRegex.test( protocolUrlMatch ) ) && urlMatch.indexOf( '.' ) === -1 );
},


Expand All @@ -116,11 +116,18 @@ Autolinker.MatchValidator = Autolinker.Util.extend( Object, {
*
* @private
* @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
* @param {String} protocolUrlMatch The match URL string for a protocol match. Ex: 'http://yahoo.com'. This is used to
* know whether or not we have a protocol in the URL string, in order to check for a word character after the protocol
* separator (':').
* @return {Boolean} `true` if the URL match does not have at least one word character in it after the protocol, `false`
* otherwise.
*/
urlMatchDoesNotHaveAtLeastOneWordChar : function( urlMatch ) {
return ( urlMatch && this.hasProtocolPrefixRegex.test( urlMatch ) && !this.hasWordCharAfterProtocolRegex.test( urlMatch ) );
urlMatchDoesNotHaveAtLeastOneWordChar : function( urlMatch, protocolUrlMatch ) {
if( urlMatch && protocolUrlMatch ) {
return !this.hasWordCharAfterProtocolRegex.test( urlMatch );
} else {
return false;
}
},


Expand All @@ -136,7 +143,7 @@ Autolinker.MatchValidator = Autolinker.Util.extend( Object, {
* @return {Boolean} `true` if it is an invalid protocol-relative match, `false` otherwise.
*/
isInvalidProtocolRelativeMatch : function( protocolRelativeMatch ) {
return ( protocolRelativeMatch && this.invalidProtocolRelMatchRegex.test( protocolRelativeMatch ) );
return ( !!protocolRelativeMatch && this.invalidProtocolRelMatchRegex.test( protocolRelativeMatch ) );
}

} );
27 changes: 18 additions & 9 deletions src/match/Url.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ Autolinker.match.Url = Autolinker.Util.extend( Autolinker.match.Match, {
* The url that was matched.
*/

/**
* @cfg {Boolean} protocolUrlMatch (required)
*
* `true` if the URL is a match which already has a protocol (e.g. 'http://'), `false` if the match was from a 'www' or
* known TLD match.
*/

/**
* @cfg {Boolean} protocolRelativeMatch (required)
*
Expand Down Expand Up @@ -46,13 +53,13 @@ Autolinker.match.Url = Autolinker.Util.extend( Autolinker.match.Match, {
protocolRelativeRegex : /^\/\//,

/**
* @protected
* @property {RegExp} checkForProtocolRegex
* @private
* @property {Boolean} protocolPrepended
*
* A regular expression used to check if the {@link #url} is missing a protocol (in which case, 'http://'
* will be added).
* Will be set to `true` if the 'http://' protocol has been prepended to the {@link #url} (because the
* {@link #url} did not have a protocol)
*/
checkForProtocolRegex: /^[A-Za-z]{3,9}:/,
protocolPrepended : false,


/**
Expand All @@ -66,17 +73,19 @@ Autolinker.match.Url = Autolinker.Util.extend( Autolinker.match.Match, {


/**
* Returns the url that was matched, assuming the protocol to be 'http://' if the match
* was missing a protocol.
* Returns the url that was matched, assuming the protocol to be 'http://' if the original
* match was missing a protocol.
*
* @return {String}
*/
getUrl : function() {
var url = this.url;

// if the url string doesn't begin with a protocol, assume http://
if( !this.protocolRelativeMatch && !this.checkForProtocolRegex.test( url ) ) {
// if the url string doesn't begin with a protocol, assume 'http://'
if( !this.protocolRelativeMatch && !this.protocolUrlMatch && !this.protocolPrepended ) {
url = this.url = 'http://' + url;

this.protocolPrepended = true;
}

return url;
Expand Down
Loading

0 comments on commit 2479a45

Please sign in to comment.