Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for IPv6 addresses #298

Merged
merged 5 commits into from
Jul 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import tldextract
import tldextract.suffix_list
from tldextract.cache import DiskCache
from tldextract.remote import inet_pton, looks_like_ip
from tldextract.remote import inet_pton, lenient_netloc, looks_like_ip
from tldextract.suffix_list import SuffixListNotFound
from tldextract.tldextract import ExtractResult

Expand All @@ -32,6 +32,7 @@ def assert_extract(
url: str,
expected_domain_data: tuple[str, str, str, str],
expected_ip_data: str = "",
expected_ipv6_data: str = "",
funs: Sequence[tldextract.TLDExtract] = (
extract,
extract_no_cache,
Expand All @@ -58,6 +59,7 @@ def assert_extract(
assert expected_domain == ext.domain
assert expected_tld == ext.suffix
assert expected_ip_data == ext.ipv4
assert expected_ipv6_data == ext.ipv6


def test_american():
Expand Down Expand Up @@ -133,6 +135,26 @@ def test_ip():
)


def test_lenient_netloc():
    """Check the netloc extracted from plain, bracketed, and IPv6 hosts."""
    # Each pair is (input URL, netloc that lenient_netloc should return).
    # Bracketed hosts keep their brackets and drop any trailing port.
    cases = (
        ("https://example.com.ca", "example.com.ca"),
        ("https://[example.com.ca]", "[example.com.ca]"),
        ("https://[example.com.ca]:5000", "[example.com.ca]"),
        (
            "https://[aBcD:ef01:2345:6789:aBcD:ef01::]:5000",
            "[aBcD:ef01:2345:6789:aBcD:ef01::]",
        ),
        (
            "https://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]:5000",
            "[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]",
        ),
        (
            # Unicode dot look-alikes inside the brackets are left untouched.
            "https://[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611]:5000",
            "[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611]",
        ),
    )
    for url, expected_netloc in cases:
        assert lenient_netloc(url) == expected_netloc


@pytest.mark.skipif(not inet_pton, reason="inet_pton unavailable")
def test_looks_like_ip_with_inet_pton():
assert looks_like_ip("1.1.1.1", inet_pton) is True
Expand Down Expand Up @@ -272,6 +294,21 @@ def test_username():
"ftp://johndoe:[email protected]:2501",
("1337.warez.com", "1337", "warez", "com"),
)
assert_extract(
"https://apple:pass@127.0.0.1:50/a",
("", "", "127.0.0.1", ""),
expected_ip_data="127.0.0.1",
)
assert_extract(
"https://apple:pass@[::]:50/a",
("", "", "[::]", ""),
expected_ipv6_data="::",
)
assert_extract(
"https://apple:pass@[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]:50/a",
("", "", "[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]", ""),
expected_ipv6_data="aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1",
)


def test_query_fragment():
Expand Down
30 changes: 25 additions & 5 deletions tldextract/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

import re
from collections.abc import Callable
from ipaddress import AddressValueError, IPv6Address
from urllib.parse import scheme_chars

inet_pton: Callable[[int, str], bytes] | None
try:
from socket import AF_INET, inet_pton # Availability: Unix, Windows.
from socket import AF_INET, AF_INET6, inet_pton # Availability: Unix, Windows.
except ImportError:
inet_pton = None

Expand All @@ -27,16 +28,18 @@ def lenient_netloc(url: str) -> str:
urllib.parse.{urlparse,urlsplit}, but extract more leniently, without
raising errors.
"""
return (
after_userinfo = (
_schemeless_url(url)
.partition("/")[0]
.partition("?")[0]
.partition("#")[0]
.rpartition("@")[-1]
.partition(":")[0]
.strip()
.rstrip(".\u3002\uff0e\uff61")
)
if after_userinfo and after_userinfo[0] == "[":
maybe_ipv6 = after_userinfo.partition("]")
if maybe_ipv6[1] == "]":
return f"{maybe_ipv6[0]}]"
return after_userinfo.partition(":")[0].strip().rstrip(".\u3002\uff0e\uff61")


def _schemeless_url(url: str) -> str:
Expand Down Expand Up @@ -66,3 +69,20 @@ def looks_like_ip(
except OSError:
return False
return IP_RE.fullmatch(maybe_ip) is not None


def looks_like_ipv6(
    maybe_ip: str, pton: Callable[[int, str], bytes] | None = inet_pton
) -> bool:
    """Check whether the given str looks like an IPv6 address.

    Args:
        maybe_ip: Candidate address text, without surrounding brackets.
        pton: ``inet_pton``-style parser used for validation. When ``None``,
            ``ipaddress.IPv6Address`` is used instead.

    Returns:
        ``True`` if the text parses as an IPv6 address, ``False`` otherwise.
        Malformed input never raises.
    """
    if pton is not None:
        try:
            pton(AF_INET6, maybe_ip)
        except (OSError, ValueError):
            # OSError: the text is not a valid IPv6 address.
            # ValueError: the text cannot even be handed to the C-level
            # parser (e.g. it contains an embedded NUL character).
            return False
        return True
    try:
        IPv6Address(maybe_ip)
    except AddressValueError:
        return False
    return True
34 changes: 33 additions & 1 deletion tldextract/tldextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
import idna

from .cache import DiskCache, get_cache_dir
from .remote import lenient_netloc, looks_like_ip
from .remote import lenient_netloc, looks_like_ip, looks_like_ipv6
from .suffix_list import get_suffix_lists

LOG = logging.getLogger("tldextract")
Expand Down Expand Up @@ -134,6 +134,29 @@ def ipv4(self) -> str:
return self.domain
return ""

@property
def ipv6(self) -> str:
    """
    Return the IPv6 address if the presented domain/url is a bracketed one.

    The surrounding brackets are stripped from the returned value. Any
    other input yields the empty string.

    >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]/path/to/file').ipv6
    'aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1'
    >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1.1]/path/to/file').ipv6
    ''
    >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:256.0.0.1]').ipv6
    ''
    """
    host = self.domain
    # The shortest bracketed IPv6 literal is "[::]", hence the minimum of 4.
    if len(host) < 4 or host[0] != "[" or host[-1] != "]":
        return ""
    # A result that has a suffix or subdomain is not treated as an IP literal.
    if self.suffix or self.subdomain:
        return ""
    candidate = host[1:-1]
    return candidate if looks_like_ipv6(candidate) else ""


class TLDExtract:
"""A callable for extracting, subdomain, domain, and suffix components from a URL."""
Expand Down Expand Up @@ -260,6 +283,15 @@ def _extract_netloc(
.replace("\uff0e", "\u002e")
.replace("\uff61", "\u002e")
)

if (
len(netloc_with_ascii_dots) >= 4
and netloc_with_ascii_dots[0] == "["
and netloc_with_ascii_dots[-1] == "]"
):
if looks_like_ipv6(netloc_with_ascii_dots[1:-1]):
return ExtractResult("", netloc_with_ascii_dots, "")

labels = netloc_with_ascii_dots.split(".")

suffix_index = self._get_tld_extractor().suffix_index(
Expand Down