diff --git a/README.md b/README.md index cfb9382b..f1aed2dc 100644 --- a/README.md +++ b/README.md @@ -22,13 +22,13 @@ A public suffix is also sometimes called an effective TLD (eTLD). >>> import tldextract >>> tldextract.extract('http://forums.news.cnn.com/') -ExtractResult(subdomain='forums.news', domain='cnn', suffix='com') +ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False) >>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom -ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk') +ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False) >>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan -ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg') +ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False) ``` `ExtractResult` is a namedtuple, so it's simple to access the parts you want. @@ -50,13 +50,13 @@ subdomain or a valid suffix. ```python >>> tldextract.extract('google.com') -ExtractResult(subdomain='', domain='google', suffix='com') +ExtractResult(subdomain='', domain='google', suffix='com', is_private=False) >>> tldextract.extract('google.notavalidsuffix') -ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='') +ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_private=False) >>> tldextract.extract('http://127.0.0.1:8080/deployed/') -ExtractResult(subdomain='', domain='127.0.0.1', suffix='') +ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False) ``` If you want to rejoin the whole namedtuple, regardless of whether a subdomain @@ -161,21 +161,21 @@ By default, `tldextract` treats public and private domains the same. ```python >>> extract = tldextract.TLDExtract() >>> extract('waiterrant.blogspot.com') -ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com') +ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com', is_private=False) ``` The following overrides this. ```python >>> extract = tldextract.TLDExtract() >>> extract('waiterrant.blogspot.com', include_psl_private_domains=True) -ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com') +ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True) ``` or to change the default for all extract calls, ```python >>> extract = tldextract.TLDExtract( include_psl_private_domains=True) >>> extract('waiterrant.blogspot.com') -ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com') +ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True) ``` The thinking behind the default is, it's the more common case when people diff --git a/tests/custom_suffix_test.py b/tests/custom_suffix_test.py index 8258a53d..9958ae33 100644 --- a/tests/custom_suffix_test.py +++ b/tests/custom_suffix_test.py @@ -26,11 +26,12 @@ def test_private_extraction(): tld = tldextract.TLDExtract(cache_dir=tempfile.mkdtemp(), suffix_list_urls=[]) - assert tld("foo.blogspot.com") == ("foo", "blogspot", "com") + assert tld("foo.blogspot.com") == ("foo", "blogspot", "com", False) assert tld("foo.blogspot.com", include_psl_private_domains=True) == ( "", "foo", "blogspot.com", + True, ) diff --git a/tests/main_test.py b/tests/main_test.py index 6ea57c50..f95ef6ba 100644 --- a/tests/main_test.py +++ b/tests/main_test.py @@ -418,7 +418,12 @@ def test_result_as_dict(): result = extract( "http://admin:password1@www.google.com:666/secret/admin/interface?param1=42" ) - expected_dict = {"subdomain": "www", "domain": "google", "suffix": "com"} + expected_dict = { + "subdomain": "www", + "domain": "google", + "suffix": "com", + "is_private": False, + } assert result._asdict() == expected_dict @@ -460,10 +465,10 @@ def test_include_psl_private_domain_attr(): extract_private = tldextract.TLDExtract(include_psl_private_domains=True) extract_public = tldextract.TLDExtract(include_psl_private_domains=False) assert extract_private("foo.uk.com") == ExtractResult( - subdomain="", domain="foo", suffix="uk.com" + subdomain="", domain="foo", suffix="uk.com", is_private=True ) assert extract_public("foo.uk.com") == ExtractResult( - subdomain="foo", domain="uk", suffix="com" + subdomain="foo", domain="uk", suffix="com", is_private=False ) @@ -478,38 +483,92 @@ def test_tlds_property(): def test_global_extract(): - assert tldextract.extract("foo.blogspot.com") == ExtractResult( - subdomain="foo", domain="blogspot", suffix="com" - ) assert tldextract.extract( - "foo.blogspot.com", include_psl_private_domains=True - ) == ExtractResult(subdomain="", domain="foo", suffix="blogspot.com") + "blogspot.com", include_psl_private_domains=True + ) == ExtractResult(subdomain="", domain="", suffix="blogspot.com", is_private=True) assert tldextract.extract( - "s3.ap-south-1.amazonaws.com", include_psl_private_domains=True - ) == ExtractResult(subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com") + "foo.blogspot.com", include_psl_private_domains=True + ) == ExtractResult( + subdomain="", domain="foo", suffix="blogspot.com", is_private=True + ) assert tldextract.extract( "the-quick-brown-fox.ap-south-1.amazonaws.com", include_psl_private_domains=True ) == ExtractResult( - subdomain="the-quick-brown-fox.ap-south-1", domain="amazonaws", suffix="com" + subdomain="the-quick-brown-fox.ap-south-1", + domain="amazonaws", + suffix="com", + is_private=False, ) assert tldextract.extract( "ap-south-1.amazonaws.com", include_psl_private_domains=True - ) == ExtractResult(subdomain="ap-south-1", domain="amazonaws", suffix="com") + ) == ExtractResult( + subdomain="ap-south-1", domain="amazonaws", suffix="com", is_private=False + ) assert tldextract.extract( "amazonaws.com", include_psl_private_domains=True - ) == ExtractResult(subdomain="", domain="amazonaws", suffix="com") - assert tldextract.extract( - "s3.cn-north-1.amazonaws.com.cn", include_psl_private_domains=True - ) == ExtractResult(subdomain="", domain="", suffix="s3.cn-north-1.amazonaws.com.cn") + ) == ExtractResult(subdomain="", domain="amazonaws", suffix="com", is_private=False) assert tldextract.extract( "the-quick-brown-fox.cn-north-1.amazonaws.com.cn", include_psl_private_domains=True, ) == ExtractResult( - subdomain="the-quick-brown-fox.cn-north-1", domain="amazonaws", suffix="com.cn" + subdomain="the-quick-brown-fox.cn-north-1", + domain="amazonaws", + suffix="com.cn", + is_private=False, ) assert tldextract.extract( "cn-north-1.amazonaws.com.cn", include_psl_private_domains=True - ) == ExtractResult(subdomain="cn-north-1", domain="amazonaws", suffix="com.cn") + ) == ExtractResult( + subdomain="cn-north-1", domain="amazonaws", suffix="com.cn", is_private=False + ) assert tldextract.extract( "amazonaws.com.cn", include_psl_private_domains=True - ) == ExtractResult(subdomain="", domain="amazonaws", suffix="com.cn") + ) == ExtractResult( + subdomain="", domain="amazonaws", suffix="com.cn", is_private=False + ) + assert tldextract.extract( + "another.icann.compute.amazonaws.com", include_psl_private_domains=True + ) == ExtractResult( + subdomain="", + domain="another", + suffix="icann.compute.amazonaws.com", + is_private=True, + ) + assert tldextract.extract( + "another.s3.dualstack.us-east-1.amazonaws.com", include_psl_private_domains=True + ) == ExtractResult( + subdomain="", + domain="another", + suffix="s3.dualstack.us-east-1.amazonaws.com", + is_private=True, + ) + + assert tldextract.extract( + "s3.ap-south-1.amazonaws.com", include_psl_private_domains=True + ) == ExtractResult( + subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com", is_private=True + ) + assert tldextract.extract( + "s3.cn-north-1.amazonaws.com.cn", include_psl_private_domains=True + ) == ExtractResult( + subdomain="", + domain="", + suffix="s3.cn-north-1.amazonaws.com.cn", + is_private=True, + ) + assert tldextract.extract( + "icann.compute.amazonaws.com", include_psl_private_domains=True + ) == ExtractResult( + subdomain="", domain="", suffix="icann.compute.amazonaws.com", is_private=True + ) + + # Entire URL is private suffix which ends with another private suffix + # i.e. "s3.dualstack.us-east-1.amazonaws.com" ends with "us-east-1.amazonaws.com" + assert tldextract.extract( + "s3.dualstack.us-east-1.amazonaws.com", include_psl_private_domains=True + ) == ExtractResult( + subdomain="", + domain="", + suffix="s3.dualstack.us-east-1.amazonaws.com", + is_private=True, + ) diff --git a/tests/test_trie.py b/tests/test_trie.py index 94c2a7fc..11d1d00c 100644 --- a/tests/test_trie.py +++ b/tests/test_trie.py @@ -5,19 +5,11 @@ def test_nested_dict() -> None: - original_keys_sequence = [ - ["a"], - ["a", "d"], - ["a", "b"], - ["a", "b", "c"], - ["c"], - ["c", "b"], - ["d", "f"], - ] - for keys_sequence in permutations(original_keys_sequence): + suffixes = ["a", "d.a", "b.a", "c.b.a", "c", "b.c", "f.d"] + for suffixes_sequence in permutations(suffixes): trie = Trie() - for keys in keys_sequence: - trie.add_suffix(keys) + for suffix in suffixes_sequence: + trie.add_suffix(suffix) # check each nested value # Top level c assert "c" in trie.matches diff --git a/tldextract/cli.py b/tldextract/cli.py index 59d452ac..09495658 100644 --- a/tldextract/cli.py +++ b/tldextract/cli.py @@ -88,4 +88,5 @@ def main() -> None: sys.exit(1) for i in args.input: - print(" ".join(tld_extract(i))) + subdomain, domain, suffix, _ = tld_extract(i) + print(f"{subdomain} {domain} {suffix}") diff --git a/tldextract/tldextract.py b/tldextract/tldextract.py index 816737da..8c4b6197 100644 --- a/tldextract/tldextract.py +++ b/tldextract/tldextract.py @@ -5,13 +5,13 @@ >>> import tldextract >>> tldextract.extract('http://forums.news.cnn.com/') - ExtractResult(subdomain='forums.news', domain='cnn', suffix='com') + ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False) >>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom - ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk') + ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False) >>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan - ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg') + ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False) `ExtractResult` is a namedtuple, so it's simple to access the parts you want. @@ -29,23 +29,23 @@ subdomain or a valid suffix. >>> tldextract.extract('google.com') - ExtractResult(subdomain='', domain='google', suffix='com') + ExtractResult(subdomain='', domain='google', suffix='com', is_private=False) >>> tldextract.extract('google.notavalidsuffix') - ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='') + ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_private=False) >>> tldextract.extract('http://127.0.0.1:8080/deployed/') - ExtractResult(subdomain='', domain='127.0.0.1', suffix='') + ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False) If you want to rejoin the whole namedtuple, regardless of whether a subdomain or suffix were found: >>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/') >>> # this has unwanted dots - >>> '.'.join(ext) + >>> '.'.join(part for part in ext[:3]) '.127.0.0.1.' >>> # join part only if truthy - >>> '.'.join(part for part in ext if part) + >>> '.'.join(part for part in ext[:3] if part) '127.0.0.1' """ @@ -78,11 +78,13 @@ class ExtractResult(NamedTuple): - """namedtuple of a URL's subdomain, domain, and suffix.""" + """namedtuple of a URL's subdomain, domain, suffix, + and flag that indicates if URL has private suffix.""" subdomain: str domain: str suffix: str + is_private: bool = False @property def registered_domain(self) -> str: @@ -108,10 +110,10 @@ def fqdn(self) -> str: >>> extract('http://localhost:8080').fqdn '' """ - if self.suffix and self.domain: + if self.suffix and (self.domain or self.is_private): # Disable bogus lint error (https://github.com/PyCQA/pylint/issues/2568) - # pylint: disable-next=not-an-iterable - return ".".join(i for i in self if i) + # pylint: disable-next=not-an-iterable,unsubscriptable-object + return ".".join(i for i in self[:3] if i) return "" @property @@ -249,9 +251,9 @@ def extract_str( >>> extractor = TLDExtract() >>> extractor.extract_str('http://forums.news.cnn.com/') - ExtractResult(subdomain='forums.news', domain='cnn', suffix='com') + ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False) >>> extractor.extract_str('http://forums.bbc.co.uk/') - ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk') + ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False) """ return self._extract_netloc(lenient_netloc(url), include_psl_private_domains) @@ -270,9 +272,9 @@ def extract_urllib( >>> extractor = TLDExtract() >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.news.cnn.com/')) - ExtractResult(subdomain='forums.news', domain='cnn', suffix='com') + ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False) >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.bbc.co.uk/')) - ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk') + ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False) """ return self._extract_netloc(url.netloc, include_psl_private_domains) @@ -296,7 +298,7 @@ def _extract_netloc( labels = netloc_with_ascii_dots.split(".") - suffix_index = self._get_tld_extractor().suffix_index( + suffix_index, is_private = self._get_tld_extractor().suffix_index( labels, include_psl_private_domains=include_psl_private_domains ) @@ -304,12 +306,12 @@ def _extract_netloc( if suffix_index == len(labels) == num_ipv4_labels and looks_like_ip( netloc_with_ascii_dots ): - return ExtractResult("", netloc_with_ascii_dots, "") + return ExtractResult("", netloc_with_ascii_dots, "", is_private) suffix = ".".join(labels[suffix_index:]) if suffix_index != len(labels) else "" subdomain = ".".join(labels[: suffix_index - 1]) if suffix_index >= 2 else "" domain = labels[suffix_index - 1] if suffix_index else "" - return ExtractResult(subdomain, domain, suffix) + return ExtractResult(subdomain, domain, suffix, is_private) def update(self, fetch_now: bool = False) -> None: """Force fetch the latest suffix list definitions.""" @@ -366,32 +368,46 @@ def _get_tld_extractor(self) -> _PublicSuffixListTLDExtractor: class Trie: """Trie for storing eTLDs with their labels in reverse-order.""" - def __init__(self, matches: dict | None = None, end: bool = False) -> None: + def __init__( + self, matches: dict | None = None, end: bool = False, is_private: bool = False + ) -> None: self.matches = matches if matches else {} self.end = end + self.is_private = is_private @staticmethod - def create(suffixes: Collection[str]) -> Trie: + def create( + public_suffixes: Collection[str], + private_suffixes: Collection[str] | None = None, + ) -> Trie: """Create a Trie from a list of suffixes and return its root node.""" root_node = Trie() - for suffix in suffixes: - suffix_labels = suffix.split(".") - suffix_labels.reverse() - root_node.add_suffix(suffix_labels) + for suffix in public_suffixes: + root_node.add_suffix(suffix) + + if private_suffixes is None: + private_suffixes = [] + + for suffix in private_suffixes: + root_node.add_suffix(suffix, True) return root_node - def add_suffix(self, labels: list[str]) -> None: + def add_suffix(self, suffix: str, is_private: bool = False) -> None: """Append a suffix's labels to this Trie node.""" node = self + labels = suffix.split(".") + labels.reverse() + for label in labels: if label not in node.matches: node.matches[label] = Trie() node = node.matches[label] node.end = True + node.is_private = is_private @wraps(TLD_EXTRACTOR.__call__) @@ -424,7 +440,9 @@ def __init__( self.private_tlds = private_tlds self.tlds_incl_private = frozenset(public_tlds + private_tlds + extra_tlds) self.tlds_excl_private = frozenset(public_tlds + extra_tlds) - self.tlds_incl_private_trie = Trie.create(self.tlds_incl_private) + self.tlds_incl_private_trie = Trie.create( + self.tlds_excl_private, frozenset(private_tlds) + ) self.tlds_excl_private_trie = Trie.create(self.tlds_excl_private) def tlds(self, include_psl_private_domains: bool | None = None) -> frozenset[str]: @@ -440,8 +458,8 @@ def tlds(self, include_psl_private_domains: bool | None = None) -> frozenset[str def suffix_index( self, spl: list[str], include_psl_private_domains: bool | None = None - ) -> int: - """Return the index of the first suffix label. + ) -> tuple[int, bool]: + """Return the index of the first suffix label, and whether it is private. Returns len(spl) if no suffix is found. """ @@ -459,21 +477,21 @@ def suffix_index( decoded_label = _decode_punycode(label) if decoded_label in node.matches: j -= 1 - if node.matches[decoded_label].end: - i = j node = node.matches[decoded_label] + if node.end: + i = j continue is_wildcard = "*" in node.matches if is_wildcard: is_wildcard_exception = "!" + decoded_label in node.matches if is_wildcard_exception: - return j - return j - 1 + return j, node.matches["*"].is_private + return j - 1, node.matches["*"].is_private break - return i + return i, node.is_private def _decode_punycode(label: str) -> str: