Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate ExtractResult from namedtuple to dataclass #306

Merged
merged 1 commit into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 6 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,6 @@ ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)
```

`ExtractResult` is a namedtuple, so it's simple to access the parts you want.

```python
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
>>> (ext.subdomain, ext.domain, ext.suffix)
('forums', 'bbc', 'co.uk')
>>> # rejoin subdomain and domain
>>> '.'.join(ext[:2])
'forums.bbc'
>>> # a common alias
>>> ext.registered_domain
'bbc.co.uk'
```

Note subdomain and suffix are _optional_. Not all URL-like inputs have a
subdomain or a valid suffix.

Expand All @@ -59,17 +45,14 @@ ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_privat
ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)
```

If you want to rejoin the whole namedtuple, regardless of whether a subdomain
or suffix were found:
To rejoin the original hostname, if it was indeed a valid, registered hostname:

```python
>>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/')
>>> # this has unwanted dots
>>> '.'.join(ext[:3])
'.127.0.0.1.'
>>> # join each part only if it's truthy
>>> '.'.join(part for part in ext[:3] if part)
'127.0.0.1'
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
>>> ext.registered_domain
'bbc.co.uk'
>>> ext.fqdn
'forums.bbc.co.uk'
```

By default, this package supports the public ICANN TLDs and their exceptions.
Expand Down
5 changes: 3 additions & 2 deletions tests/custom_suffix_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import tempfile

import tldextract
from tldextract.tldextract import ExtractResult

FAKE_SUFFIX_LIST_URL = "file://" + os.path.join(
os.path.dirname(os.path.abspath(__file__)), "fixtures/fake_suffix_list_fixture.dat"
Expand All @@ -27,8 +28,8 @@ def test_private_extraction() -> None:
"""Test this library's uncached, offline, private domain extraction."""
tld = tldextract.TLDExtract(cache_dir=tempfile.mkdtemp(), suffix_list_urls=[])

assert tld("foo.blogspot.com") == ("foo", "blogspot", "com", False)
assert tld("foo.blogspot.com", include_psl_private_domains=True) == (
assert tld("foo.blogspot.com") == ExtractResult("foo", "blogspot", "com", False)
assert tld("foo.blogspot.com", include_psl_private_domains=True) == ExtractResult(
"",
"foo",
"blogspot.com",
Expand Down
14 changes: 0 additions & 14 deletions tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,20 +412,6 @@ def test_ipv4_lookalike() -> None:
)


def test_result_as_dict() -> None:
"""Test that the result is a namedtuple."""
result = extract(
"http://admin:password1@www.google.com:666/secret/admin/interface?param1=42"
)
expected_dict = {
"subdomain": "www",
"domain": "google",
"suffix": "com",
"is_private": False,
}
assert result._asdict() == expected_dict


def test_cache_permission(
mocker: pytest_mock.MockerFixture, monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
Expand Down
4 changes: 2 additions & 2 deletions tldextract/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,5 @@ def main() -> None:
sys.exit(1)

for i in args.input:
subdomain, domain, suffix, _ = tld_extract(i)
print(f"{subdomain} {domain} {suffix}")
ext = tld_extract(i)
print(f"{ext.subdomain} {ext.domain} {ext.suffix}")
45 changes: 16 additions & 29 deletions tldextract/tldextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,6 @@
>>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)

`ExtractResult` is a namedtuple, so it's simple to access the parts you want.

>>> ext = tldextract.extract('http://forums.bbc.co.uk')
>>> (ext.subdomain, ext.domain, ext.suffix)
('forums', 'bbc', 'co.uk')
>>> # rejoin subdomain and domain
>>> '.'.join(ext[:2])
'forums.bbc'
>>> # a common alias
>>> ext.registered_domain
'bbc.co.uk'

Note subdomain and suffix are _optional_. Not all URL-like inputs have a
subdomain or a valid suffix.

Expand All @@ -37,16 +25,13 @@
>>> tldextract.extract('http://127.0.0.1:8080/deployed/')
ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)

If you want to rejoin the whole namedtuple, regardless of whether a subdomain
or suffix were found:
To rejoin the original hostname, if it was indeed a valid, registered hostname:

>>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/')
>>> # this has unwanted dots
>>> '.'.join(part for part in ext[:3])
'.127.0.0.1.'
>>> # join part only if truthy
>>> '.'.join(part for part in ext[:3] if part)
'127.0.0.1'
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
>>> ext.registered_domain
'bbc.co.uk'
>>> ext.fqdn
'forums.bbc.co.uk'
"""

from __future__ import annotations
Expand All @@ -55,10 +40,8 @@
import os
import urllib.parse
from collections.abc import Collection, Sequence
from dataclasses import dataclass
from functools import wraps
from typing import (
NamedTuple,
)

import idna

Expand All @@ -77,13 +60,17 @@
)


class ExtractResult(NamedTuple):
"""namedtuple of a URL's subdomain, domain, suffix, and flag that indicates if URL has private suffix."""
@dataclass(order=True)
class ExtractResult:
"""A URL's extracted subdomain, domain, and suffix.

Also contains metadata, like a flag that indicates if the URL has a private suffix.
"""

subdomain: str
domain: str
suffix: str
is_private: bool = False
is_private: bool

@property
def registered_domain(self) -> str:
Expand All @@ -110,7 +97,7 @@ def fqdn(self) -> str:
''
"""
if self.suffix and (self.domain or self.is_private):
return ".".join(i for i in self[:3] if i)
return ".".join(i for i in (self.subdomain, self.domain, self.suffix) if i)
return ""

@property
Expand Down Expand Up @@ -291,7 +278,7 @@ def _extract_netloc(
and netloc_with_ascii_dots[-1] == "]"
):
if looks_like_ipv6(netloc_with_ascii_dots[1:-1]):
return ExtractResult("", netloc_with_ascii_dots, "")
return ExtractResult("", netloc_with_ascii_dots, "", is_private=False)

labels = netloc_with_ascii_dots.split(".")

Expand Down