Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(core): Improve domain and url matching for extractDomain and extractUrl #6010

Merged
merged 4 commits into from
Apr 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 91 additions & 25 deletions packages/workflow/src/Extensions/StringExtensions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,95 @@ const hashFunctions: Record<string, typeof CryptoJS.MD5> = {
// All symbols from https://www.xe.com/symbols/ as of 2022/11/09
// A single capture group alternating over currency symbols and abbreviations, each spelled out
// as \u escapes. Flags: `g` matches every occurrence, `u` enables Unicode mode.
const CURRENCY_REGEXP =
/(\u004c\u0065\u006b|\u060b|\u0024|\u0192|\u20bc|\u0042\u0072|\u0042\u005a\u0024|\u0024\u0062|\u004b\u004d|\u0050|\u043b\u0432|\u0052\u0024|\u17db|\u00a5|\u20a1|\u006b\u006e|\u20b1|\u004b\u010d|\u006b\u0072|\u0052\u0044\u0024|\u00a3|\u20ac|\u00a2|\u0051|\u004c|\u0046\u0074|\u20b9|\u0052\u0070|\ufdfc|\u20aa|\u004a\u0024|\u20a9|\u20ad|\u0434\u0435\u043d|\u0052\u004d|\u20a8|\u20ae|\u004d\u0054|\u0043\u0024|\u20a6|\u0042\u002f\u002e|\u0047\u0073|\u0053\u002f\u002e|\u007a\u0142|\u006c\u0065\u0069|\u20bd|\u0414\u0438\u043d\u002e|\u0053|\u0052|\u0043\u0048\u0046|\u004e\u0054\u0024|\u0e3f|\u0054\u0054\u0024|\u20ba|\u20b4|\u0024\u0055|\u0042\u0073|\u20ab|\u005a\u0024)/gu;
// Whole-string domain check: the first label is 3-63 characters (alphanumeric at both ends,
// hyphens allowed in between), followed by one or more dot-separated segments of 2+ letters.
// NOTE(review): `DOMAIN_REGEXP` is declared a second time further down in this listing; a `const`
// cannot be declared twice in the same scope, so confirm which declaration is meant to remain.
const DOMAIN_REGEXP = /^[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9](?:\.[a-zA-Z]{2,})+$/;

// This won't validate or catch every technically valid email address, just what most people
// would expect
/*
Extract the domain part from various inputs, including URLs, email addresses, and plain domains.
The pattern is anchored at both ends, so the whole input must match; capture group 1 is the domain.

NOTE(review): nothing in the host part accepts an `@`, so inputs of the form user@host are
presumably handled by EMAIL_REGEXP before this pattern is tried - confirm in extractDomain.

/^(?:(?:https?|ftp):\/\/)?               // Match optional http, https, or ftp protocols
(?:mailto:)?                             // Match optional mailto:
(?:\/\/)?                                // Match optional double slashes
(                                        // Capture the domain part
  (?:www\.)?                             // Match optional www prefix (kept inside the capture)
  (?:(?:[-\w]+\.)+                       // Match one or more dot-terminated labels (subdomains)
    (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)    // Match top-level domain or Punycode encoded IDN (xn--80aswg.xn--p1ai)
  |localhost                             // Match localhost
  |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}    // Match IPv4-style addresses (octet range is not validated)
  )
)
(?::\d+)?                                // Match optional port number
(?:\/[^\s?]*)?                           // Match optional path
(?:\?[^\s#]*)?                           // Match optional query string
(?:#[^\s]*)?$/i;                         // Match optional hash fragment
*/
const DOMAIN_EXTRACT_REGEXP =
/^(?:(?:https?|ftp):\/\/)?(?:mailto:)?(?:\/\/)?((?:www\.)?(?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;

/*
Matches a bare host with no protocol: an optional www prefix, then a dotted domain name,
localhost, or an IPv4-style address, optionally followed by a port, path, query string, and
fragment. The pattern is anchored at both ends; capture group 1 is the host without the www prefix.

/^(?:www\.)?                             // Match optional www prefix (outside the capture)
(                                        // Capture the domain part
  (?:(?:[-\w]+\.)+                       // Match one or more dot-terminated labels (subdomains)
    (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)    // Match top-level domain or Punycode encoded IDN
  |localhost                             // Match localhost
  |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}    // Match IPv4-style addresses (octet range is not validated)
  )
)
(?::\d+)?                                // Match optional port number
(?:\/[^\s?]*)?                           // Match optional path
(?:\?[^\s#]*)?                           // Match optional query string
(?:#[^\s]*)?$/i;                         // Match optional fragment at the end of the string
*/
const DOMAIN_REGEXP =
/^(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;

/*
Matches email addresses. The pattern is not anchored, so it finds an address anywhere in the
input; the part after the @ is exposed as the named group `domain`.

/(                                                       // Capture the local part of the email address
  ([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)    // One or more characters not in the set, optionally
                                                         // followed by further dot-separated runs of such characters
  |(".+")                                                // Or one or more characters inside quotes
)
@                                                        // Match @ symbol
(?<domain>                                               // Capture the domain part of the email address
  (\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])    // Match IPv4-style address inside brackets
  |(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,})                     // Or one or more dot-terminated labels followed by a
                                                         // top-level domain of at least two letters
)/;
*/
const EMAIL_REGEXP =
/(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@(?<domain>(\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/;

// This also might not catch every possible URL
/*
Matches URLs with strict beginning and end of the string checks. A protocol is mandatory here.
Capture group 1 is the host without the www prefix.

/^(?:(?:https?|ftp):\/\/)                // Match http, https, or ftp protocols at the start of the string
(?:www\.)?                               // Match optional www prefix
(                                        // Capture the domain part
  (?:(?:[-\w]+\.)+                       // Match one or more dot-terminated labels (subdomains)
    (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)    // Match top-level domain or Punycode encoded IDN
  |localhost                             // Match localhost
  |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}    // Match IPv4-style addresses (octet range is not validated)
  )
)
(?::\d+)?                                // Match optional port number
(?:\/[^\s?#]*)?                          // Match optional path
(?:\?[^\s#]*)?                           // Match optional query string
(?=([^\s]+#.*)?)                         // Positive lookahead for the fragment identifier; its inner group is
                                         // optional, so the lookahead itself never rejects a match
#?[^\s]*$/i;                             // Match an optional '#' and any remaining non-whitespace characters
                                         // up to the end of the string
*/
const URL_REGEXP_EXACT =
/^(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*$/i;

/*
Same as URL_REGEXP_EXACT but without the strict beginning and end of the string checks to allow for
matching URLs in the middle of a string.

Accepts http, https, and ftp URLs whose host is a dotted domain name (including Punycode TLDs),
localhost, or an IPv4-style address, with optional port, path, query string, and fragment.
The match stops at the first whitespace character. Capture group 1 is the host without the
www prefix.

Only this single pattern is bound to the constant; a superseded http/https-only pattern that
required a dot in the host (and so could not match localhost) must not precede it, otherwise
the constant would take the old pattern and this one would be a dead expression statement.
*/
const URL_REGEXP =
	/(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*/i;

// Matches any single Unicode letter (the \p{L} property escape requires the `u` flag)
const CHAR_TEST_REGEXP = /\p{L}/u;
// Matches a single `!`, `?`, or `.` character
const PUNC_TEST_REGEXP = /[!?.]/;
Expand Down Expand Up @@ -182,24 +261,7 @@ function isNumeric(value: string) {
}

/**
 * Reports whether the entire string is a URL as defined by `URL_REGEXP_EXACT`.
 *
 * The check is purely pattern based: the value must start with an http, https, or ftp
 * protocol and match through to the end of the string. Previously a `URL`-constructor
 * check that only accepted http/https returned first, which left the pattern test
 * unreachable.
 *
 * @param value - The string to test.
 * @returns `true` when the whole string matches the URL pattern, otherwise `false`.
 */
function isUrl(value: string) {
	return URL_REGEXP_EXACT.test(value);
}

function isDomain(value: string) {
Expand Down Expand Up @@ -272,9 +334,13 @@ function extractDomain(value: string) {
return undefined;
}
return matched.groups?.domain;
} else if (isUrl(value)) {
return new URL(value).hostname;
}

const domainMatch = value.match(DOMAIN_EXTRACT_REGEXP);
if (domainMatch) {
return domainMatch[1];
}

return undefined;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,43 @@ describe('Data Transformation Functions', () => {

test('.isUrl should work on a string', () => {
	// [input, expected] pairs, checked in order. A protocol (http, https, or ftp) is required,
	// so bare domains and paths are expected to be rejected.
	const cases: Array<[string, boolean]> = [
		['https://example.com/', true],
		['http://example.com/', true],
		['ftp://example.com/', true],
		['example.com', false],
		['www.example.com', false],
		['https://www.example.com/', true],
		['https://example.com/path', true],
		['https://example.com/path?query=1', true],
		['https://example.com/path#fragment', true],
		['https://example.com:8080', true],
		['https://example.com?query=1', true],
		['https://example.com#fragment', true],
		['example.com/path', false],
		['http:///', false],
		['https://', false],
		['example', false],
		['', false],
	];

	for (const [input, expected] of cases) {
		expect(evaluate(`={{ "${input}".isUrl() }}`)).toEqual(expected);
	}
});

test('.isDomain should work on a string', () => {
	// [input, expected] pairs, checked in the listed order.
	const cases: Array<[string, boolean]> = [
		['example.com', true],
		['asdf', false],
		['https://example.com/', false],
		['www.example.com', true],
		['subdomain.example.com', true],
		['example.co.uk', true],
		['example', false],
		['example.', false],
		['.com', false],
		['example..com', false],
		['example_com', false],
		['example/com', false],
		['example com', false],
		['www.example..com', false],
		['123.com', true],
		['xn--80aswg.xn--p1ai', true], // Punycode domain
		['example.com:8080', true],
		['', false],
	];

	for (const [input, expected] of cases) {
		expect(evaluate(`={{ "${input}".isDomain() }}`)).toEqual(expected);
	}
});

test('.toSnakeCase should work on a string', () => {
Expand All @@ -127,11 +157,43 @@ describe('Data Transformation Functions', () => {
'={{ "I am a test with a url: https://example.net/ and I am a test with an email: [email protected]".extractUrl() }}',
),
).toEqual('https://example.net/');
expect(evaluate('={{ "Check this out: https://subdomain.example.com:3000/path?q=1#hash".extractUrl() }}')).toEqual('https://subdomain.example.com:3000/path?q=1#hash');
expect(evaluate('={{ "Invalid URL: http:///example.com".extractUrl() }}')).toEqual(undefined);
expect(evaluate('={{ "Mixed content: https://www.example.com and http://www.example.org".extractUrl() }}')).toEqual('https://www.example.com');
expect(evaluate('={{ "Text without URL: This is just a simple text".extractUrl() }}')).toEqual(undefined);
expect(evaluate('={{ "URL with Unicode: http://www.xn--80aswg.xn--j1amh".extractUrl() }}')).toEqual('http://www.xn--80aswg.xn--j1amh');
expect(evaluate('={{ "Localhost URL: http://localhost:8080/test?x=1".extractUrl() }}')).toEqual('http://localhost:8080/test?x=1');
expect(evaluate('={{ "IP URL: http://192.168.1.1:8000/path?q=value#frag".extractUrl() }}')).toEqual('http://192.168.1.1:8000/path?q=value#frag');
});

test('.extractDomain should work on a string', () => {
	// [input, expected domain] pairs, checked in the listed order; `undefined` means no domain
	// should be extracted.
	// NOTE(review): several inputs read "[email protected]" - these look like email addresses that
	// were redacted wherever this listing was copied from; they are kept verbatim here.
	const cases: Array<[string, string | undefined]> = [
		['[email protected]', 'example.org'],
		['https://example.org/', 'example.org'],
		['https://www.google.com', 'www.google.com'],
		['http://example.org', 'example.org'],
		['ftp://ftp.example.com', 'ftp.example.com'],
		['google.com', 'google.com'],
		['www.example.net', 'www.example.net'],
		['//example.com', 'example.com'],
		['mailto:[email protected]', 'example.com'],
		['tel:+1-555-123-4567', undefined],
		['[email protected]', 'example.org'],
		['[email protected]', 'example.com'],
		['[email protected]', 'example.co.uk'],
		['[email protected]', 'subdomain.example.com'],
		['www.example.net?test=1213', 'www.example.net'],
		['www.example.net?test', 'www.example.net'],
		['www.example.net#tesdt123', 'www.example.net'],
		['https://www.example.net?test=1213', 'www.example.net'],
		['https://www.example.net?test', 'www.example.net'],
		['https://www.example.net#tesdt123', 'www.example.net'],
		['https://192.168.1.1', '192.168.1.1'],
		['http://www.xn--80aswg.xn--j1amh', 'www.xn--80aswg.xn--j1amh'],
		['https://localhost', 'localhost'],
		['https://localhost?test=123', 'localhost'],
		['https://www.example_with_underscore.com', 'www.example_with_underscore.com'],
		['https://www.example.com:8080', 'www.example.com'],
		['https://example.space', 'example.space'],
	];

	for (const [input, expected] of cases) {
		expect(evaluate(`={{ "${input}".extractDomain() }}`)).toEqual(expected);
	}
});

test('.extractEmail should work on a string', () => {
Expand Down