From 33fb73217dca68244c93296f1a4be96cc83e4480 Mon Sep 17 00:00:00 2001 From: OlegIvaniv Date: Thu, 20 Apr 2023 09:01:27 +0200 Subject: [PATCH] fix(core): Improve domain and url matching for extractDomain and extractUrl (#6010) * fix(core): Fix domain and url matching for isDomain/isUrl/extractDomain/extractUrl * Document regex and include www in the domain * Lint fix --- .../src/Extensions/StringExtensions.ts | 116 ++++++++++++++---- .../StringExtensions.test.ts | 62 ++++++++++ 2 files changed, 153 insertions(+), 25 deletions(-) diff --git a/packages/workflow/src/Extensions/StringExtensions.ts b/packages/workflow/src/Extensions/StringExtensions.ts index fb79704094dc5..00343a05fd01c 100644 --- a/packages/workflow/src/Extensions/StringExtensions.ts +++ b/packages/workflow/src/Extensions/StringExtensions.ts @@ -21,16 +21,95 @@ const hashFunctions: Record = { // All symbols from https://www.xe.com/symbols/ as for 2022/11/09 const CURRENCY_REGEXP = /(\u004c\u0065\u006b|\u060b|\u0024|\u0192|\u20bc|\u0042\u0072|\u0042\u005a\u0024|\u0024\u0062|\u004b\u004d|\u0050|\u043b\u0432|\u0052\u0024|\u17db|\u00a5|\u20a1|\u006b\u006e|\u20b1|\u004b\u010d|\u006b\u0072|\u0052\u0044\u0024|\u00a3|\u20ac|\u00a2|\u0051|\u004c|\u0046\u0074|\u20b9|\u0052\u0070|\ufdfc|\u20aa|\u004a\u0024|\u20a9|\u20ad|\u0434\u0435\u043d|\u0052\u004d|\u20a8|\u20ae|\u004d\u0054|\u0043\u0024|\u20a6|\u0042\u002f\u002e|\u0047\u0073|\u0053\u002f\u002e|\u007a\u0142|\u006c\u0065\u0069|\u20bd|\u0414\u0438\u043d\u002e|\u0053|\u0052|\u0043\u0048\u0046|\u004e\u0054\u0024|\u0e3f|\u0054\u0054\u0024|\u20ba|\u20b4|\u0024\u0055|\u0042\u0073|\u20ab|\u005a\u0024)/gu; -const DOMAIN_REGEXP = /^[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9](?:\.[a-zA-Z]{2,})+$/; -// This won't validate or catch literally valid email address, just what most people -// would expect +/* + Extract the domain part from various inputs, including URLs, email addresses, and plain domains. + + /^(?:(?:https?|ftp):\/\/)? 
// Match optional http, https, or ftp protocols + (?:mailto:)? // Match optional mailto: + (?:\/\/)? // Match optional double slashes + ( // Capture the domain part + (?:www\.)? // Match optional www prefix (kept in the captured domain) + (?: // Group the alternatives: domain name, localhost, or IPv4 + (?:[-\w]+\.)+ // Match one or more subdomains + (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN (e.g. xn--80aswg.xn--p1ai) + |localhost // Match localhost + |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses + ) + ) + (?::\d+)? // Match optional port number + (?:\/[^\s?]*)? // Match optional path + (?:\?[^\s#]*)? // Match optional query string + (?:#[^\s]*)?$/i; // Match optional hash fragment +*/ +const DOMAIN_EXTRACT_REGEXP = + /^(?:(?:https?|ftp):\/\/)?(?:mailto:)?(?:\/\/)?((?:www\.)?(?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i; + +/* + Matches domain names with no protocol prefix; port, path, query string and fragment are optional + + /^(?:www\.)? // Match optional www prefix + ( // Capture the domain part + (?:(?:[-\w]+\.)+ // Match one or more subdomains + (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN + |localhost // Match localhost + |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses + ) + ) + (?::\d+)? // Match optional port number + (?:\/[^\s?]*)? // Match optional path + (?:\?[^\s#]*)?
// Match optional query string + (?:#[^\s]*)?$/i; // Match optional fragment at the end of the string +*/ +const DOMAIN_REGEXP = + /^(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i; + +/* + Matches email addresses + + /( + ( // Capture local part of the email address + ([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*) // One or more characters not in the set, followed by + a period, followed by one or more characters not in the set + |(".+") // Or one or more characters inside quotes + ) + ) + @ // Match @ symbol + (?<domain>( // Capture the domain part of the email address + \[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\] // Match IPv4 address inside brackets + |(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}) // Or match a domain: one or more dot-separated labels followed by a TLD + ))/; +*/ const EMAIL_REGEXP = /(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@(?<domain>(\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/; -// This also might not catch every possible URL +/* + Matches URLs with strict beginning and end of the string checks + + /^(?:(?:https?|ftp):\/\/) // Match http, https, or ftp protocols at the start of the string + (?:www\.)? // Match optional www prefix + ( // Capture the domain part + (?:(?:[-\w]+\.)+ // Match one or more subdomains + (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN + |localhost // Match localhost + |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses + ) + ) + (?::\d+)? // Match optional port number + (?:\/[^\s?#]*)? // Match optional path + (?:\?[^\s#]*)? // Match optional query string + (?=([^\s]+#.*)?)
// Positive lookahead for the fragment identifier + #?[^\s]*$/i; // Match optional fragment at the end of the string +*/ +const URL_REGEXP_EXACT = + /^(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*$/i; + +/* + Same as URL_REGEXP_EXACT but without the strict beginning and end of the string checks to allow for + matching URLs in the middle of a string +*/ const URL_REGEXP = - /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,}\b([-a-zA-Z0-9()\[\]@:%_\+.~#?&//=]*)/; + /(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*/i; const CHAR_TEST_REGEXP = /\p{L}/u; const PUNC_TEST_REGEXP = /[!?.]/; @@ -182,24 +261,7 @@ function isNumeric(value: string) { } function isUrl(value: string) { - let url: URL; - try { - url = new URL(value); - } catch { - return false; - } - - // URL constructor tolerates missing `//` after protocol so check manually - for (const scheme of ['http:', 'https:']) { - if ( - url.protocol === scheme && - value.slice(scheme.length, scheme.length + '//'.length) === '//' - ) { - return true; - } - } - - return false; + return URL_REGEXP_EXACT.test(value); } function isDomain(value: string) { @@ -272,9 +334,13 @@ function extractDomain(value: string) { return undefined; } return matched.groups?.domain; - } else if (isUrl(value)) { - return new URL(value).hostname; } + + const domainMatch = value.match(DOMAIN_EXTRACT_REGEXP); + if (domainMatch) { + return domainMatch[1]; + } + return undefined; } diff --git a/packages/workflow/test/ExpressionExtensions/StringExtensions.test.ts b/packages/workflow/test/ExpressionExtensions/StringExtensions.test.ts index 11230cb205555..2b31634480d24 100644 --- a/packages/workflow/test/ExpressionExtensions/StringExtensions.test.ts 
+++ b/packages/workflow/test/ExpressionExtensions/StringExtensions.test.ts @@ -97,13 +97,43 @@ describe('Data Transformation Functions', () => { test('.isUrl should work on a string', () => { expect(evaluate('={{ "https://example.com/".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "http://example.com/".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "ftp://example.com/".isUrl() }}')).toEqual(true); expect(evaluate('={{ "example.com".isUrl() }}')).toEqual(false); + expect(evaluate('={{ "www.example.com".isUrl() }}')).toEqual(false); + expect(evaluate('={{ "https://www.example.com/".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "https://example.com/path".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "https://example.com/path?query=1".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "https://example.com/path#fragment".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "https://example.com:8080".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "https://example.com?query=1".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "https://example.com#fragment".isUrl() }}')).toEqual(true); + expect(evaluate('={{ "example.com/path".isUrl() }}')).toEqual(false); + expect(evaluate('={{ "http:///".isUrl() }}')).toEqual(false); + expect(evaluate('={{ "https://".isUrl() }}')).toEqual(false); + expect(evaluate('={{ "example".isUrl() }}')).toEqual(false); + expect(evaluate('={{ "".isUrl() }}')).toEqual(false); }); test('.isDomain should work on a string', () => { expect(evaluate('={{ "example.com".isDomain() }}')).toEqual(true); expect(evaluate('={{ "asdf".isDomain() }}')).toEqual(false); expect(evaluate('={{ "https://example.com/".isDomain() }}')).toEqual(false); + expect(evaluate('={{ "www.example.com".isDomain() }}')).toEqual(true); + expect(evaluate('={{ "subdomain.example.com".isDomain() }}')).toEqual(true); + expect(evaluate('={{ "example.co.uk".isDomain() }}')).toEqual(true); + expect(evaluate('={{ "example".isDomain() }}')).toEqual(false); + 
expect(evaluate('={{ "example.".isDomain() }}')).toEqual(false); + expect(evaluate('={{ ".com".isDomain() }}')).toEqual(false); + expect(evaluate('={{ "example..com".isDomain() }}')).toEqual(false); + expect(evaluate('={{ "example_com".isDomain() }}')).toEqual(false); + expect(evaluate('={{ "example/com".isDomain() }}')).toEqual(false); + expect(evaluate('={{ "example com".isDomain() }}')).toEqual(false); + expect(evaluate('={{ "www.example..com".isDomain() }}')).toEqual(false); + expect(evaluate('={{ "123.com".isDomain() }}')).toEqual(true); + expect(evaluate('={{ "xn--80aswg.xn--p1ai".isDomain() }}')).toEqual(true); // Punycode domain + expect(evaluate('={{ "example.com:8080".isDomain() }}')).toEqual(true); + expect(evaluate('={{ "".isDomain() }}')).toEqual(false); }); test('.toSnakeCase should work on a string', () => { @@ -127,11 +157,43 @@ describe('Data Transformation Functions', () => { '={{ "I am a test with a url: https://example.net/ and I am a test with an email: test@example.org".extractUrl() }}', ), ).toEqual('https://example.net/'); + expect(evaluate('={{ "Check this out: https://subdomain.example.com:3000/path?q=1#hash".extractUrl() }}')).toEqual('https://subdomain.example.com:3000/path?q=1#hash'); + expect(evaluate('={{ "Invalid URL: http:///example.com".extractUrl() }}')).toEqual(undefined); + expect(evaluate('={{ "Mixed content: https://www.example.com and http://www.example.org".extractUrl() }}')).toEqual('https://www.example.com'); + expect(evaluate('={{ "Text without URL: This is just a simple text".extractUrl() }}')).toEqual(undefined); + expect(evaluate('={{ "URL with Unicode: http://www.xn--80aswg.xn--j1amh".extractUrl() }}')).toEqual('http://www.xn--80aswg.xn--j1amh'); + expect(evaluate('={{ "Localhost URL: http://localhost:8080/test?x=1".extractUrl() }}')).toEqual('http://localhost:8080/test?x=1'); + expect(evaluate('={{ "IP URL: http://192.168.1.1:8000/path?q=value#frag".extractUrl() 
}}')).toEqual('http://192.168.1.1:8000/path?q=value#frag'); }); test('.extractDomain should work on a string', () => { expect(evaluate('={{ "test@example.org".extractDomain() }}')).toEqual('example.org'); expect(evaluate('={{ "https://example.org/".extractDomain() }}')).toEqual('example.org'); + expect(evaluate('={{ "https://www.google.com".extractDomain() }}')).toEqual('www.google.com'); + expect(evaluate('={{ "http://example.org".extractDomain() }}')).toEqual('example.org'); + expect(evaluate('={{ "ftp://ftp.example.com".extractDomain() }}')).toEqual('ftp.example.com'); + expect(evaluate('={{ "google.com".extractDomain() }}')).toEqual('google.com'); + expect(evaluate('={{ "www.example.net".extractDomain() }}')).toEqual('www.example.net'); + expect(evaluate('={{ "//example.com".extractDomain() }}')).toEqual('example.com'); + expect(evaluate('={{ "mailto:john.doe@example.com".extractDomain() }}')).toEqual('example.com'); + expect(evaluate('={{ "tel:+1-555-123-4567".extractDomain() }}')).toEqual(undefined); + expect(evaluate('={{ "jane.doe@example.org".extractDomain() }}')).toEqual('example.org'); + expect(evaluate('={{ "name+tag@example.com".extractDomain() }}')).toEqual('example.com'); + expect(evaluate('={{ "first.last@example.co.uk".extractDomain() }}')).toEqual('example.co.uk'); + expect(evaluate('={{ "user@subdomain.example.com".extractDomain() }}')).toEqual('subdomain.example.com'); + expect(evaluate('={{ "www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net'); + expect(evaluate('={{ "www.example.net?test".extractDomain() }}')).toEqual('www.example.net'); + expect(evaluate('={{ "www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net'); + expect(evaluate('={{ "https://www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net'); + expect(evaluate('={{ "https://www.example.net?test".extractDomain() }}')).toEqual('www.example.net'); + expect(evaluate('={{ "https://www.example.net#tesdt123".extractDomain() 
}}')).toEqual('www.example.net'); + expect(evaluate('={{ "https://192.168.1.1".extractDomain() }}')).toEqual('192.168.1.1'); + expect(evaluate('={{ "http://www.xn--80aswg.xn--j1amh".extractDomain() }}')).toEqual('www.xn--80aswg.xn--j1amh'); + expect(evaluate('={{ "https://localhost".extractDomain() }}')).toEqual('localhost'); + expect(evaluate('={{ "https://localhost?test=123".extractDomain() }}')).toEqual('localhost'); + expect(evaluate('={{ "https://www.example_with_underscore.com".extractDomain() }}')).toEqual('www.example_with_underscore.com'); + expect(evaluate('={{ "https://www.example.com:8080".extractDomain() }}')).toEqual('www.example.com'); + expect(evaluate('={{ "https://example.space".extractDomain() }}')).toEqual('example.space'); }); test('.extractEmail should work on a string', () => {