Skip to content

Commit

Permalink
Migrate to WHATWG URL everywhere
Browse files Browse the repository at this point in the history
Also discovers & resolves a few URL related issues:
- WHATWG parses any potentially valid integer string as an IP - add explicit IP detection to avoid this case and preserve existing parsing functionality
- WHATWG does not fuzzy parse ipv6 addresses, so add simple regex-based ipv6 extraction to handle this case and preserve existing functionality
- WHATWG does not parse scheme-less URLs so change prefixing behavior to include a scheme when falling back to URL-based parsing
  • Loading branch information
jesseditson committed Oct 16, 2024
1 parent 7056afb commit d956699
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 12 deletions.
39 changes: 28 additions & 11 deletions lib/clean-host.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ var isValid = require('./is-valid.js');
*/

// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
var hasPrefixRE = /^(([a-z][a-z0-9+.-]*)?:)?\/\//;
var hasPrefixRE = /^(([a-z][a-z0-9+.-]+)?:)\/\//;
var ipLikeRE = /(\d+\.){3}\d+/i;
var ipv6LikeRE = /(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}|::1)/i;


/**
Expand Down Expand Up @@ -74,30 +76,45 @@ module.exports = function extractHostname(value) {
url = '' + url;
}

var needsTrimming = checkTrimmingNeeded(url);
if (needsTrimming) {
url = url.trim();
}

var needsLowerCase = checkLowerCaseNeeded(url);
if (needsLowerCase) {
url = url.toLowerCase();
}

var v6Match = url.match(ipv6LikeRE);
if (v6Match) {
return v6Match[1];
}

var needsTrimming = checkTrimmingNeeded(url);
if (needsTrimming) {
url = url.trim();
}

// Try again after `url` has been transformed to lowercase and trimmed.
if ((needsLowerCase || needsTrimming) && isValid(url)) {
return trimTrailingDots(url);
}

// Proceed with heavier url parsing to extract the hostname.
if (!hasPrefixRE.test(url)) {
url = '//' + url;
url = 'https://' + url;
}

var parts = new URL(url);

if (parts.hostname) {
return trimTrailingDots(parts.hostname);
try {
var parts = new URL(url);

if (parts.hostname) {
// WHATWG URL parses any integer sequence as an IP, whereas the legacy
// node URL module would not. Preserve behavior where non-ip-like strings
// will not result in valid hostnames.
if (ipLikeRE.test(parts.hostname) && !ipLikeRE.test(value)) {
return value;
}
return trimTrailingDots(parts.hostname);
}
} catch (e) {
// Invalid URL
}

return null;
Expand Down
2 changes: 1 addition & 1 deletion test/tld.js
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ describe('tld.js', function () {
});

it('should return the initial value if it is not a valid hostname', function(){
expect(tld.extractHostname(42)).to.equal('42');
expect(tld.extractHostname(42)).to.equal(42);
});

it('should return www.nytimes.com even with an URL as a parameter', function(){
Expand Down

0 comments on commit d956699

Please sign in to comment.