-
Notifications
You must be signed in to change notification settings - Fork 239
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #260 from gregjacobs/linear-time-email-matcher
Linear time email matcher
- Loading branch information
Showing
7 changed files
with
430 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,11 @@ | ||
import { Matcher } from "./matcher"; | ||
import { alphaNumericAndMarksCharsStr, getDomainNameStr } from "../regex-lib"; | ||
import { tldRegex } from "./tld-regex"; | ||
import { alphaNumericAndMarksCharsStr, domainNameCharRegex } from "../regex-lib"; | ||
import { EmailMatch } from "../match/email-match"; | ||
import { Match } from "../match/match"; | ||
import { throwUnhandledCaseError } from '../utils'; | ||
|
||
// For debugging: search for other "For debugging" lines | ||
// import CliTable from 'cli-table'; | ||
|
||
/** | ||
* @class Autolinker.matcher.Email | ||
|
@@ -15,49 +18,245 @@ import { Match } from "../match/match"; | |
export class EmailMatcher extends Matcher { | ||
|
||
/** | ||
* The regular expression to match email addresses. Example match: | ||
* | ||
* person@place.com | ||
* | ||
* @protected | ||
* @property {RegExp} matcherRegex | ||
* Valid characters that can be used in the "local" part of an email address, | ||
* i.e. the "name" part of "name@site.com" | ||
*/ | ||
protected matcherRegex = (function() { | ||
var specialCharacters = '!#$%&\'*+\\-\\/=?^_`{|}~', | ||
restrictedSpecialCharacters = '\\s"(),:;<>@\\[\\]', | ||
validCharacters = alphaNumericAndMarksCharsStr + specialCharacters, | ||
validRestrictedCharacters = validCharacters + restrictedSpecialCharacters, | ||
emailRegex = new RegExp( '(?:[' + validCharacters + '](?:[' + validCharacters + ']|\\.(?!\\.|@))*|\\"[' + validRestrictedCharacters + '.]+\\")@'); | ||
|
||
return new RegExp( [ | ||
emailRegex.source, | ||
getDomainNameStr( 1 ), | ||
'\\.', tldRegex.source // '.com', '.net', etc | ||
].join( "" ), 'gi' ); | ||
} )(); | ||
protected localPartCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}!#$%&'*+/=?^_\`{|}~-]` ); | ||
|
||
|
||
/** | ||
* @inheritdoc | ||
*/ | ||
parseMatches( text: string ) { | ||
let matcherRegex = this.matcherRegex, | ||
tagBuilder = this.tagBuilder, | ||
matches: Match[] = [], | ||
match: RegExpExecArray | null; | ||
const tagBuilder = this.tagBuilder, | ||
localPartCharRegex = this.localPartCharRegex, | ||
matches: Match[] = [], | ||
len = text.length, | ||
noCurrentEmailAddress = new CurrentEmailAddress(); | ||
|
||
let charIdx = 0, | ||
state = State.NonEmailAddress as State, | ||
currentEmailAddress = noCurrentEmailAddress; | ||
|
||
// For debugging: search for other "For debugging" lines | ||
// const table = new CliTable( { | ||
// head: [ 'charIdx', 'char', 'state', 'charIdx', 'currentEmailAddress.idx', 'hasDomainDot' ] | ||
// } ); | ||
|
||
while( ( match = matcherRegex.exec( text ) ) !== null ) { | ||
let matchedText = match[ 0 ]; | ||
while( charIdx < len ) { | ||
const char = text.charAt( charIdx ); | ||
|
||
matches.push( new EmailMatch( { | ||
tagBuilder : tagBuilder, | ||
matchedText : matchedText, | ||
offset : match.index, | ||
email : matchedText | ||
} ) ); | ||
// For debugging: search for other "For debugging" lines | ||
// table.push( | ||
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ] | ||
// ); | ||
|
||
switch( state ) { | ||
case State.NonEmailAddress: stateNonEmailAddress( char ); break; | ||
case State.LocalPart: stateLocalPart( char ); break; | ||
case State.LocalPartDot: stateLocalPartDot( char ); break; | ||
case State.AtSign: stateAtSign( char ); break; | ||
case State.DomainChar: stateDomainChar( char ); break; | ||
case State.DomainHyphen: stateDomainHyphen( char ); break; | ||
case State.DomainDot: stateDomainDot( char ); break; | ||
|
||
default: | ||
throwUnhandledCaseError( state ); | ||
} | ||
|
||
// For debugging: search for other "For debugging" lines | ||
// table.push( | ||
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ] | ||
// ); | ||
|
||
charIdx++; | ||
} | ||
|
||
// Capture any valid match at the end of the string | ||
captureMatchIfValidAndReset(); | ||
|
||
// For debugging: search for other "For debugging" lines | ||
//console.log( '\n' + table.toString() ); | ||
|
||
return matches; | ||
|
||
|
||
// Handles the state when we're not in an email address | ||
function stateNonEmailAddress( char: string ) { | ||
if( localPartCharRegex.test( char ) ) { | ||
beginEmailAddress(); | ||
|
||
} else { | ||
// not an email address character, continue | ||
} | ||
} | ||
|
||
|
||
// Handles the state when we're currently in the "local part" of an | ||
// email address (as opposed to the "domain part") | ||
function stateLocalPart( char: string ) { | ||
if( char === '.' ) { | ||
state = State.LocalPartDot; | ||
|
||
} else if( char === '@' ) { | ||
state = State.AtSign; | ||
|
||
} else if( localPartCharRegex.test( char ) ) { | ||
// stay in the "local part" of the email address | ||
|
||
} else { | ||
// not an email address character, return to "NonEmailAddress" state | ||
resetToNonEmailAddressState(); | ||
} | ||
} | ||
|
||
|
||
// Handles the state where we've just read a dot ('.') in the "local part" of an email address | ||
function stateLocalPartDot( char: string ) { | ||
if( char === '.' ) { | ||
// We read a second '.' in a row, not a valid email address | ||
// local part | ||
resetToNonEmailAddressState(); | ||
|
||
} else if( char === '@' ) { | ||
// We read the '@' character immediately after a dot ('.'), not | ||
// an email address | ||
resetToNonEmailAddressState(); | ||
|
||
} else if( localPartCharRegex.test( char ) ) { | ||
state = State.LocalPart; | ||
|
||
} else { | ||
// Anything else, not an email address | ||
resetToNonEmailAddressState(); | ||
} | ||
} | ||
|
||
|
||
function stateAtSign( char: string ) { | ||
if( domainNameCharRegex.test( char ) ) { | ||
state = State.DomainChar; | ||
|
||
} else { | ||
// Anything else, not an email address | ||
resetToNonEmailAddressState(); | ||
} | ||
} | ||
|
||
function stateDomainChar( char: string ) { | ||
if( char === '.' ) { | ||
state = State.DomainDot; | ||
|
||
} else if( char === '-' ) { | ||
state = State.DomainHyphen; | ||
|
||
} else if( domainNameCharRegex.test( char ) ) { | ||
// Stay in the DomainChar state | ||
|
||
} else { | ||
// Anything else, we potentially matched if the criteria has | ||
// been met | ||
captureMatchIfValidAndReset(); | ||
} | ||
} | ||
|
||
function stateDomainHyphen( char: string ) { | ||
if( char === '-' || char === '.' ) { | ||
// Not valid to have two hyphens ("--") or hyphen+dot ("-.") | ||
captureMatchIfValidAndReset(); | ||
|
||
} else if( domainNameCharRegex.test( char ) ) { | ||
state = State.DomainChar; | ||
|
||
} else { | ||
// Anything else | ||
captureMatchIfValidAndReset(); | ||
} | ||
} | ||
|
||
function stateDomainDot( char: string ) { | ||
if( char === '.' || char === '-' ) { | ||
// not valid to have two dots ("..") or dot+hyphen (".-") | ||
captureMatchIfValidAndReset(); | ||
|
||
} else if( domainNameCharRegex.test( char ) ) { | ||
state = State.DomainChar; | ||
|
||
// After having read a '.' and then a valid domain character, | ||
// we now know that the domain part of the email is valid, and | ||
// we have found at least a partial EmailMatch (however, the | ||
// email address may have additional characters from this point) | ||
currentEmailAddress = new CurrentEmailAddress( { | ||
...currentEmailAddress, | ||
hasDomainDot: true | ||
} ); | ||
|
||
} else { | ||
// Anything else | ||
captureMatchIfValidAndReset(); | ||
} | ||
} | ||
|
||
|
||
function beginEmailAddress() { | ||
state = State.LocalPart; | ||
currentEmailAddress = new CurrentEmailAddress( { idx: charIdx } ); | ||
} | ||
|
||
function resetToNonEmailAddressState() { | ||
state = State.NonEmailAddress; | ||
currentEmailAddress = noCurrentEmailAddress | ||
} | ||
|
||
|
||
/* | ||
* Captures the current email address as an EmailMatch if it's valid, | ||
* and resets the state to read another email address. | ||
*/ | ||
function captureMatchIfValidAndReset() { | ||
if( currentEmailAddress.hasDomainDot ) { // we need at least one dot in the domain to be considered a valid email address | ||
let emailAddress = text.slice( currentEmailAddress.idx, charIdx ); | ||
|
||
// If we read a '.' or '-' char that ended the email address | ||
// (valid domain name characters, but only valid email address | ||
// characters if they are followed by something else), strip | ||
// it off now | ||
if( /[-.]$/.test( emailAddress ) ){ | ||
emailAddress = emailAddress.slice( 0, -1 ); | ||
} | ||
|
||
matches.push( new EmailMatch( { | ||
tagBuilder : tagBuilder, | ||
matchedText : emailAddress, | ||
offset : currentEmailAddress.idx, | ||
email : emailAddress | ||
} ) ); | ||
} | ||
|
||
resetToNonEmailAddressState(); | ||
} | ||
} | ||
|
||
} | ||
|
||
|
||
const enum State { | ||
NonEmailAddress = 0, | ||
LocalPart, | ||
LocalPartDot, | ||
AtSign, | ||
DomainChar, | ||
DomainHyphen, | ||
DomainDot | ||
} | ||
|
||
|
||
class CurrentEmailAddress { | ||
readonly idx: number; // the index of the first character in the email address | ||
readonly hasDomainDot: boolean; | ||
|
||
constructor( cfg: Partial<CurrentEmailAddress> = {} ) { | ||
this.idx = cfg.idx !== undefined ? cfg.idx : -1; | ||
this.hasDomainDot = !!cfg.hasDomainDot; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.