john-kurkowski · john-kurkowski · Jul 13, 2023 · Jul 7, 2023 · Jul 7, 2023 · Jul 7, 2023
diff --git a/tests/main_test.py b/tests/main_test.py
@@ -13,7 +13,7 @@
 import tldextract
 import tldextract.suffix_list
 from tldextract.cache import DiskCache
-from tldextract.remote import inet_pton, looks_like_ip
+from tldextract.remote import inet_pton, lenient_netloc, looks_like_ip
 from tldextract.suffix_list import SuffixListNotFound
 from tldextract.tldextract import ExtractResult
 
@@ -32,6 +32,7 @@ def assert_extract(
     url: str,
     expected_domain_data: tuple[str, str, str, str],
     expected_ip_data: str = "",
+    expected_ipv6_data: str = "",
     funs: Sequence[tldextract.TLDExtract] = (
         extract,
         extract_no_cache,
@@ -58,6 +59,7 @@ def assert_extract(
         assert expected_domain == ext.domain
         assert expected_tld == ext.suffix
         assert expected_ip_data == ext.ipv4
+        assert expected_ipv6_data == ext.ipv6
 
 
 def test_american():
@@ -133,6 +135,26 @@ def test_ip():
     )
 
 
+def test_lenient_netloc():
+    """Check that lenient_netloc keeps bracketed hosts whole and drops the port."""
+    v6_plain = "[aBcD:ef01:2345:6789:aBcD:ef01::]"
+    v6_dotted_tail = "[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]"
+    # Same dotted tail, written with full-width / ideographic dot look-alikes;
+    # lenient_netloc is expected to hand these back untouched.
+    v6_unicode_dots = "[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611]"
+    expected_by_url = {
+        "https://example.com.ca": "example.com.ca",
+        "https://[example.com.ca]": "[example.com.ca]",
+        "https://[example.com.ca]:5000": "[example.com.ca]",
+        f"https://{v6_plain}:5000": v6_plain,
+        f"https://{v6_dotted_tail}:5000": v6_dotted_tail,
+        f"https://{v6_unicode_dots}:5000": v6_unicode_dots,
+    }
+    # One table-driven assertion per URL.
+    for url, expected in expected_by_url.items():
+        assert lenient_netloc(url) == expected
+
+
 @pytest.mark.skipif(not inet_pton, reason="inet_pton unavailable")
 def test_looks_like_ip_with_inet_pton():
     assert looks_like_ip("1.1.1.1", inet_pton) is True
@@ -272,6 +294,21 @@ def test_username():
         "ftp://johndoe:[email protected]:2501",
         ("1337.warez.com", "1337", "warez", "com"),
     )
+    assert_extract(
+        "https://apple:[email protected]:50/a",
+        ("", "", "127.0.0.1", ""),
+        expected_ip_data="127.0.0.1",
+    )
+    assert_extract(
+        "https://apple:pass@[::]:50/a",
+        ("", "", "[::]", ""),
+        expected_ipv6_data="::",
+    )
+    assert_extract(
+        "https://apple:pass@[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]:50/a",
+        ("", "", "[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]", ""),
+        expected_ipv6_data="aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1",
+    )
 
 
 def test_query_fragment():

diff --git a/tldextract/remote.py b/tldextract/remote.py
@@ -4,11 +4,12 @@
 
 import re
 from collections.abc import Callable
+from ipaddress import AddressValueError, IPv6Address
 from urllib.parse import scheme_chars
 
 inet_pton: Callable[[int, str], bytes] | None
 try:
-    from socket import AF_INET, inet_pton  # Availability: Unix, Windows.
+    from socket import AF_INET, AF_INET6, inet_pton  # Availability: Unix, Windows.
 except ImportError:
     inet_pton = None
 
@@ -27,16 +28,18 @@ def lenient_netloc(url: str) -> str:
     urllib.parse.{urlparse,urlsplit}, but extract more leniently, without
     raising errors.
     """
-    return (
+    after_userinfo = (
         _schemeless_url(url)
         .partition("/")[0]
         .partition("?")[0]
         .partition("#")[0]
         .rpartition("@")[-1]
-        .partition(":")[0]
-        .strip()
-        .rstrip(".\u3002\uff0e\uff61")
     )
+    if after_userinfo and after_userinfo[0] == "[":
+        maybe_ipv6 = after_userinfo.partition("]")
+        if maybe_ipv6[1] == "]":
+            return f"{maybe_ipv6[0]}]"
+    return after_userinfo.partition(":")[0].strip().rstrip(".\u3002\uff0e\uff61")
 
 
 def _schemeless_url(url: str) -> str:
@@ -66,3 +69,20 @@ def looks_like_ip(
         except OSError:
             return False
     return IP_RE.fullmatch(maybe_ip) is not None
+
+
+def looks_like_ipv6(
+    maybe_ip: str, pton: Callable[[int, str], bytes] | None = inet_pton
+) -> bool:
+    """Return True if maybe_ip parses as an IPv6 address, False for any other str."""
+    if pton is not None:
+        try:
+            pton(AF_INET6, maybe_ip)
+            return True
+        except (OSError, ValueError):  # ValueError: embedded NUL in maybe_ip
+            return False
+    try:
+        IPv6Address(maybe_ip)
+    except AddressValueError:
+        return False
+    return True
diff --git a/tldextract/tldextract.py b/tldextract/tldextract.py
@@ -63,7 +63,7 @@
 import idna
 
 from .cache import DiskCache, get_cache_dir
-from .remote import lenient_netloc, looks_like_ip
+from .remote import lenient_netloc, looks_like_ip, looks_like_ipv6
 from .suffix_list import get_suffix_lists
 
 LOG = logging.getLogger("tldextract")
@@ -134,6 +134,29 @@ def ipv4(self) -> str:
             return self.domain
         return ""
 
+    @property
+    def ipv6(self) -> str:
+        """
+        Return the IPv6 address, without brackets, if the host is an IPv6 literal.
+
+        >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]/path/to/file').ipv6
+        'aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1'
+        >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1.1]/path/to/file').ipv6
+        ''
+        >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:256.0.0.1]').ipv6
+        ''
+        """
+        host = self.domain
+        # An IPv6 literal is kept whole in `domain`; suffix and subdomain stay empty.
+        if self.suffix or self.subdomain:
+            return ""
+        # The shortest bracketed IPv6 literal, "[::]", is 4 characters long.
+        if len(host) < 4 or host[0] != "[" or host[-1] != "]":
+            return ""
+        # Validate the address without its enclosing brackets.
+        unbracketed = host[1:-1]
+        return unbracketed if looks_like_ipv6(unbracketed) else ""
+
 
 class TLDExtract:
     """A callable for extracting, subdomain, domain, and suffix components from a URL."""
@@ -260,6 +283,15 @@ def _extract_netloc(
             .replace("\uff0e", "\u002e")
             .replace("\uff61", "\u002e")
         )
+
+        if (
+            len(netloc_with_ascii_dots) >= 4
+            and netloc_with_ascii_dots[0] == "["
+            and netloc_with_ascii_dots[-1] == "]"
+        ):
+            if looks_like_ipv6(netloc_with_ascii_dots[1:-1]):
+                return ExtractResult("", netloc_with_ascii_dots, "")
+
         labels = netloc_with_ascii_dots.split(".")
 
         suffix_index = self._get_tld_extractor().suffix_index(