Skip to content

Commit

Permalink
fix(core): Improve domain and url matching for extractDomain and extr…
Browse files Browse the repository at this point in the history
…actUrl (#6010)

* fix(core): Fix domain and url matching for isDomain/isUrl/extractDomain/extractUrl

* Document regex and include www in the domain

* Lint fix
  • Loading branch information
OlegIvaniv authored Apr 20, 2023
1 parent 71ed1f4 commit 33fb732
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 25 deletions.
116 changes: 91 additions & 25 deletions packages/workflow/src/Extensions/StringExtensions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,95 @@ const hashFunctions: Record<string, typeof CryptoJS.MD5> = {
// All symbols from https://www.xe.com/symbols/ as for 2022/11/09
const CURRENCY_REGEXP =
/(\u004c\u0065\u006b|\u060b|\u0024|\u0192|\u20bc|\u0042\u0072|\u0042\u005a\u0024|\u0024\u0062|\u004b\u004d|\u0050|\u043b\u0432|\u0052\u0024|\u17db|\u00a5|\u20a1|\u006b\u006e|\u20b1|\u004b\u010d|\u006b\u0072|\u0052\u0044\u0024|\u00a3|\u20ac|\u00a2|\u0051|\u004c|\u0046\u0074|\u20b9|\u0052\u0070|\ufdfc|\u20aa|\u004a\u0024|\u20a9|\u20ad|\u0434\u0435\u043d|\u0052\u004d|\u20a8|\u20ae|\u004d\u0054|\u0043\u0024|\u20a6|\u0042\u002f\u002e|\u0047\u0073|\u0053\u002f\u002e|\u007a\u0142|\u006c\u0065\u0069|\u20bd|\u0414\u0438\u043d\u002e|\u0053|\u0052|\u0043\u0048\u0046|\u004e\u0054\u0024|\u0e3f|\u0054\u0054\u0024|\u20ba|\u20b4|\u0024\u0055|\u0042\u0073|\u20ab|\u005a\u0024)/gu;
const DOMAIN_REGEXP = /^[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9](?:\.[a-zA-Z]{2,})+$/;

// This won't validate or catch every technically valid email address, just what most
// people would expect
/*
Extract the domain part from various inputs, including URLs, email addresses, and plain domains.
/^(?:(?:https?|ftp):\/\/)? // Match optional http, https, or ftp protocols
(?:mailto:)? // Match optional mailto:
(?:\/\/)? // Match optional double slashes
( // Capture the domain part
(?:www\.)? // Match optional www prefix (kept as part of the captured domain)
(?:(?:[-\w]+\.)+ // Match one or more subdomains
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN(xn--80aswg.xn--p1ai)
|localhost // Match localhost
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
)
)
(?::\d+)? // Match optional port number
(?:\/[^\s?]*)? // Match optional path
(?:\?[^\s#]*)? // Match optional query string
(?:#[^\s]*)?$/i; // Match optional hash fragment
*/
const DOMAIN_EXTRACT_REGEXP =
/^(?:(?:https?|ftp):\/\/)?(?:mailto:)?(?:\/\/)?((?:www\.)?(?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;

/*
Matches domain names without the protocol or optional subdomains
/^(?:www\.)? // Match optional www prefix
( // Capture the domain part
(?:(?:[-\w]+\.)+ // Match one or more subdomains
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN
|localhost // Match localhost
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
)
)
(?::\d+)? // Match optional port number
(?:\/[^\s?]*)? // Match optional path
(?:\?[^\s#]*)? // Match optional query string
(?:#[^\s]*)?$/i; // Match optional fragment at the end of the string
*/
const DOMAIN_REGEXP =
/^(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;

/*
Matches email addresses
/(
( // Capture local part of the email address
([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*) // One or more characters not in the set, then zero or more
groups of a period followed by one or more characters not in the set
|(".+") // Or one or more characters inside quotes
)
)
@ // Match @ symbol
(?<domain>( // Capture the domain part of the email address
\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\] // Match IPv4 address inside brackets
|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}) // Or match one or more dot-terminated labels followed by a TLD
))/;
*/
const EMAIL_REGEXP =
/(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@(?<domain>(\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/;

// This also might not catch every possible URL
/*
Matches URLs with strict beginning and end of the string checks
/^(?:(?:https?|ftp):\/\/) // Match http, https, or ftp protocols at the start of the string
(?:www\.)? // Match optional www prefix
( // Capture the domain part
(?:(?:[-\w]+\.)+ // Match one or more subdomains
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN
|localhost // Match localhost
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
)
)
(?::\d+)? // Match optional port number
(?:\/[^\s?#]*)? // Match optional path
(?:\?[^\s#]*)? // Match optional query string
(?=([^\s]+#.*)?) // Positive lookahead for the fragment identifier
#?[^\s]*$/i; // Match optional fragment at the end of the string
*/
const URL_REGEXP_EXACT =
/^(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*$/i;

/*
Same pattern as URL_REGEXP_EXACT but without the start/end-of-string anchors, so a URL can be
found anywhere inside a longer string. Accepts http, https and ftp schemes; the host may be a
domain name, localhost or an IPv4 address. Capture group 1 holds the host without a leading www.
*/
const URL_REGEXP =
	/(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*/i;

// Matches any single Unicode letter (`\p{L}` property escape, which requires the `u` flag).
const CHAR_TEST_REGEXP = /\p{L}/u;
// Matches a single `!`, `?` or `.` character.
const PUNC_TEST_REGEXP = /[!?.]/;
Expand Down Expand Up @@ -182,24 +261,7 @@ function isNumeric(value: string) {
}

/**
 * Reports whether `value` is, in its entirety, a URL as defined by `URL_REGEXP_EXACT`.
 *
 * That pattern is anchored at both ends and requires an explicit `http://`, `https://` or
 * `ftp://` scheme, so bare domains such as `example.com` are rejected.
 *
 * @param value - The string to check.
 * @returns `true` when the whole string matches `URL_REGEXP_EXACT`, otherwise `false`.
 */
function isUrl(value: string): boolean {
	// The earlier `new URL()`-based check only accepted http/https and left this regexp test
	// unreachable; the regexp is now the single source of truth.
	return URL_REGEXP_EXACT.test(value);
}

function isDomain(value: string) {
Expand Down Expand Up @@ -272,9 +334,13 @@ function extractDomain(value: string) {
return undefined;
}
return matched.groups?.domain;
} else if (isUrl(value)) {
return new URL(value).hostname;
}

const domainMatch = value.match(DOMAIN_EXTRACT_REGEXP);
if (domainMatch) {
return domainMatch[1];
}

return undefined;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,43 @@ describe('Data Transformation Functions', () => {

test('.isUrl should work on a string', () => {
	// Each entry pairs a candidate string with the verdict `.isUrl()` must return for it.
	const expectations: Array<[string, boolean]> = [
		['https://example.com/', true],
		['http://example.com/', true],
		['ftp://example.com/', true],
		['example.com', false],
		['www.example.com', false],
		['https://www.example.com/', true],
		['https://example.com/path', true],
		['https://example.com/path?query=1', true],
		['https://example.com/path#fragment', true],
		['https://example.com:8080', true],
		['https://example.com?query=1', true],
		['https://example.com#fragment', true],
		['example.com/path', false],
		['http:///', false],
		['https://', false],
		['example', false],
		['', false],
	];

	// Checked in declaration order, so the first mismatch is the one reported.
	for (const [candidate, verdict] of expectations) {
		expect(evaluate(`={{ "${candidate}".isUrl() }}`)).toEqual(verdict);
	}
});

test('.isDomain should work on a string', () => {
	// Each entry pairs a candidate string with the verdict `.isDomain()` must return for it.
	const expectations: Array<[string, boolean]> = [
		['example.com', true],
		['asdf', false],
		['https://example.com/', false],
		['www.example.com', true],
		['subdomain.example.com', true],
		['example.co.uk', true],
		['example', false],
		['example.', false],
		['.com', false],
		['example..com', false],
		['example_com', false],
		['example/com', false],
		['example com', false],
		['www.example..com', false],
		['123.com', true],
		['xn--80aswg.xn--p1ai', true], // Punycode domain
		['example.com:8080', true],
		['', false],
	];

	// Checked in declaration order, so the first mismatch is the one reported.
	for (const [candidate, verdict] of expectations) {
		expect(evaluate(`={{ "${candidate}".isDomain() }}`)).toEqual(verdict);
	}
});

test('.toSnakeCase should work on a string', () => {
Expand All @@ -127,11 +157,43 @@ describe('Data Transformation Functions', () => {
'={{ "I am a test with a url: https://example.net/ and I am a test with an email: [email protected]".extractUrl() }}',
),
).toEqual('https://example.net/');
expect(evaluate('={{ "Check this out: https://subdomain.example.com:3000/path?q=1#hash".extractUrl() }}')).toEqual('https://subdomain.example.com:3000/path?q=1#hash');
expect(evaluate('={{ "Invalid URL: http:///example.com".extractUrl() }}')).toEqual(undefined);
expect(evaluate('={{ "Mixed content: https://www.example.com and http://www.example.org".extractUrl() }}')).toEqual('https://www.example.com');
expect(evaluate('={{ "Text without URL: This is just a simple text".extractUrl() }}')).toEqual(undefined);
expect(evaluate('={{ "URL with Unicode: http://www.xn--80aswg.xn--j1amh".extractUrl() }}')).toEqual('http://www.xn--80aswg.xn--j1amh');
expect(evaluate('={{ "Localhost URL: http://localhost:8080/test?x=1".extractUrl() }}')).toEqual('http://localhost:8080/test?x=1');
expect(evaluate('={{ "IP URL: http://192.168.1.1:8000/path?q=value#frag".extractUrl() }}')).toEqual('http://192.168.1.1:8000/path?q=value#frag');
});

// Exercises `.extractDomain()` through the expression evaluator: email-style inputs, URLs with
// http/https/ftp schemes, scheme-less and protocol-relative inputs, hosts followed by a port,
// query or fragment, plus IPv4, localhost and Punycode hosts.
// NOTE(review): the "[email protected]" inputs look like addresses masked by an
// email-obfuscation filter when this text was captured — confirm against the original fixtures.
test('.extractDomain should work on a string', () => {
expect(evaluate('={{ "[email protected]".extractDomain() }}')).toEqual('example.org');
expect(evaluate('={{ "https://example.org/".extractDomain() }}')).toEqual('example.org');
// A leading "www." is expected to be kept in the extracted domain.
expect(evaluate('={{ "https://www.google.com".extractDomain() }}')).toEqual('www.google.com');
expect(evaluate('={{ "http://example.org".extractDomain() }}')).toEqual('example.org');
expect(evaluate('={{ "ftp://ftp.example.com".extractDomain() }}')).toEqual('ftp.example.com');
expect(evaluate('={{ "google.com".extractDomain() }}')).toEqual('google.com');
expect(evaluate('={{ "www.example.net".extractDomain() }}')).toEqual('www.example.net');
expect(evaluate('={{ "//example.com".extractDomain() }}')).toEqual('example.com');
expect(evaluate('={{ "mailto:[email protected]".extractDomain() }}')).toEqual('example.com');
// A non-web scheme with no host yields undefined.
expect(evaluate('={{ "tel:+1-555-123-4567".extractDomain() }}')).toEqual(undefined);
expect(evaluate('={{ "[email protected]".extractDomain() }}')).toEqual('example.org');
expect(evaluate('={{ "[email protected]".extractDomain() }}')).toEqual('example.com');
expect(evaluate('={{ "[email protected]".extractDomain() }}')).toEqual('example.co.uk');
expect(evaluate('={{ "[email protected]".extractDomain() }}')).toEqual('subdomain.example.com');
// Query strings and fragments directly after the host must not leak into the result.
expect(evaluate('={{ "www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net');
expect(evaluate('={{ "www.example.net?test".extractDomain() }}')).toEqual('www.example.net');
expect(evaluate('={{ "www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net');
expect(evaluate('={{ "https://www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net');
expect(evaluate('={{ "https://www.example.net?test".extractDomain() }}')).toEqual('www.example.net');
expect(evaluate('={{ "https://www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net');
expect(evaluate('={{ "https://192.168.1.1".extractDomain() }}')).toEqual('192.168.1.1');
expect(evaluate('={{ "http://www.xn--80aswg.xn--j1amh".extractDomain() }}')).toEqual('www.xn--80aswg.xn--j1amh');
expect(evaluate('={{ "https://localhost".extractDomain() }}')).toEqual('localhost');
expect(evaluate('={{ "https://localhost?test=123".extractDomain() }}')).toEqual('localhost');
// Underscores in a host label are expected to be accepted.
expect(evaluate('={{ "https://www.example_with_underscore.com".extractDomain() }}')).toEqual('www.example_with_underscore.com');
// The port is expected to be dropped from the result.
expect(evaluate('={{ "https://www.example.com:8080".extractDomain() }}')).toEqual('www.example.com');
expect(evaluate('={{ "https://example.space".extractDomain() }}')).toEqual('example.space');
});

test('.extractEmail should work on a string', () => {
Expand Down

0 comments on commit 33fb732

Please sign in to comment.