Skip to content

Commit

Permalink
Always include suffix if private suffix enabled and private suffix exists (#300)
Browse files Browse the repository at this point in the history

Closes #178.

---------

Co-authored-by: John Kurkowski <[email protected]>
  • Loading branch information
elliotwutingfeng and john-kurkowski authored Sep 13, 2023
1 parent 314b982 commit 789f6ef
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 76 deletions.
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ A public suffix is also sometimes called an effective TLD (eTLD).
>>> import tldextract

>>> tldextract.extract('http://forums.news.cnn.com/')
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)

>>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)

>>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg')
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)
```

`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
Expand All @@ -50,13 +50,13 @@ subdomain or a valid suffix.

```python
>>> tldextract.extract('google.com')
ExtractResult(subdomain='', domain='google', suffix='com')
ExtractResult(subdomain='', domain='google', suffix='com', is_private=False)

>>> tldextract.extract('google.notavalidsuffix')
ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='')
ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_private=False)

>>> tldextract.extract('http://127.0.0.1:8080/deployed/')
ExtractResult(subdomain='', domain='127.0.0.1', suffix='')
ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)
```

If you want to rejoin the whole namedtuple, regardless of whether a subdomain
Expand Down Expand Up @@ -161,21 +161,21 @@ By default, `tldextract` treats public and private domains the same.
```python
>>> extract = tldextract.TLDExtract()
>>> extract('waiterrant.blogspot.com')
ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')
ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com', is_private=False)
```

The following overrides this.
```python
>>> extract = tldextract.TLDExtract()
>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True)
```

or to change the default for all extract calls,
```python
>>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
>>> extract('waiterrant.blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True)
```

The thinking behind the default is, it's the more common case when people
Expand Down
3 changes: 2 additions & 1 deletion tests/custom_suffix_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
def test_private_extraction():
tld = tldextract.TLDExtract(cache_dir=tempfile.mkdtemp(), suffix_list_urls=[])

assert tld("foo.blogspot.com") == ("foo", "blogspot", "com")
assert tld("foo.blogspot.com") == ("foo", "blogspot", "com", False)
assert tld("foo.blogspot.com", include_psl_private_domains=True) == (
"",
"foo",
"blogspot.com",
True,
)


Expand Down
97 changes: 78 additions & 19 deletions tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,12 @@ def test_result_as_dict():
result = extract(
"http://admin:password1@www.google.com:666/secret/admin/interface?param1=42"
)
expected_dict = {"subdomain": "www", "domain": "google", "suffix": "com"}
expected_dict = {
"subdomain": "www",
"domain": "google",
"suffix": "com",
"is_private": False,
}
assert result._asdict() == expected_dict


Expand Down Expand Up @@ -460,10 +465,10 @@ def test_include_psl_private_domain_attr():
extract_private = tldextract.TLDExtract(include_psl_private_domains=True)
extract_public = tldextract.TLDExtract(include_psl_private_domains=False)
assert extract_private("foo.uk.com") == ExtractResult(
subdomain="", domain="foo", suffix="uk.com"
subdomain="", domain="foo", suffix="uk.com", is_private=True
)
assert extract_public("foo.uk.com") == ExtractResult(
subdomain="foo", domain="uk", suffix="com"
subdomain="foo", domain="uk", suffix="com", is_private=False
)


Expand All @@ -478,38 +483,92 @@ def test_tlds_property():


def test_global_extract():
assert tldextract.extract("foo.blogspot.com") == ExtractResult(
subdomain="foo", domain="blogspot", suffix="com"
)
assert tldextract.extract(
"foo.blogspot.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="foo", suffix="blogspot.com")
"blogspot.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="", suffix="blogspot.com", is_private=True)
assert tldextract.extract(
"s3.ap-south-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com")
"foo.blogspot.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="", domain="foo", suffix="blogspot.com", is_private=True
)
assert tldextract.extract(
"the-quick-brown-fox.ap-south-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="the-quick-brown-fox.ap-south-1", domain="amazonaws", suffix="com"
subdomain="the-quick-brown-fox.ap-south-1",
domain="amazonaws",
suffix="com",
is_private=False,
)
assert tldextract.extract(
"ap-south-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(subdomain="ap-south-1", domain="amazonaws", suffix="com")
) == ExtractResult(
subdomain="ap-south-1", domain="amazonaws", suffix="com", is_private=False
)
assert tldextract.extract(
"amazonaws.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="amazonaws", suffix="com")
assert tldextract.extract(
"s3.cn-north-1.amazonaws.com.cn", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="", suffix="s3.cn-north-1.amazonaws.com.cn")
) == ExtractResult(subdomain="", domain="amazonaws", suffix="com", is_private=False)
assert tldextract.extract(
"the-quick-brown-fox.cn-north-1.amazonaws.com.cn",
include_psl_private_domains=True,
) == ExtractResult(
subdomain="the-quick-brown-fox.cn-north-1", domain="amazonaws", suffix="com.cn"
subdomain="the-quick-brown-fox.cn-north-1",
domain="amazonaws",
suffix="com.cn",
is_private=False,
)
assert tldextract.extract(
"cn-north-1.amazonaws.com.cn", include_psl_private_domains=True
) == ExtractResult(subdomain="cn-north-1", domain="amazonaws", suffix="com.cn")
) == ExtractResult(
subdomain="cn-north-1", domain="amazonaws", suffix="com.cn", is_private=False
)
assert tldextract.extract(
"amazonaws.com.cn", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="amazonaws", suffix="com.cn")
) == ExtractResult(
subdomain="", domain="amazonaws", suffix="com.cn", is_private=False
)
assert tldextract.extract(
"another.icann.compute.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="",
domain="another",
suffix="icann.compute.amazonaws.com",
is_private=True,
)
assert tldextract.extract(
"another.s3.dualstack.us-east-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="",
domain="another",
suffix="s3.dualstack.us-east-1.amazonaws.com",
is_private=True,
)

assert tldextract.extract(
"s3.ap-south-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com", is_private=True
)
assert tldextract.extract(
"s3.cn-north-1.amazonaws.com.cn", include_psl_private_domains=True
) == ExtractResult(
subdomain="",
domain="",
suffix="s3.cn-north-1.amazonaws.com.cn",
is_private=True,
)
assert tldextract.extract(
"icann.compute.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="", domain="", suffix="icann.compute.amazonaws.com", is_private=True
)

# The entire hostname is a private suffix that itself ends with another private suffix,
# i.e. "s3.dualstack.us-east-1.amazonaws.com" ends with "us-east-1.amazonaws.com"
assert tldextract.extract(
"s3.dualstack.us-east-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="",
domain="",
suffix="s3.dualstack.us-east-1.amazonaws.com",
is_private=True,
)
16 changes: 4 additions & 12 deletions tests/test_trie.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,11 @@


def test_nested_dict() -> None:
original_keys_sequence = [
["a"],
["a", "d"],
["a", "b"],
["a", "b", "c"],
["c"],
["c", "b"],
["d", "f"],
]
for keys_sequence in permutations(original_keys_sequence):
suffixes = ["a", "d.a", "b.a", "c.b.a", "c", "b.c", "f.d"]
for suffixes_sequence in permutations(suffixes):
trie = Trie()
for keys in keys_sequence:
trie.add_suffix(keys)
for suffix in suffixes_sequence:
trie.add_suffix(suffix)
# check each nested value
# Top level c
assert "c" in trie.matches
Expand Down
3 changes: 2 additions & 1 deletion tldextract/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,5 @@ def main() -> None:
sys.exit(1)

for i in args.input:
print(" ".join(tld_extract(i)))
subdomain, domain, suffix, _ = tld_extract(i)
print(f"{subdomain} {domain} {suffix}")
Loading

0 comments on commit 789f6ef

Please sign in to comment.