diff --git a/tests/main_test.py b/tests/main_test.py
index 6bfbda27..347711c3 100644
--- a/tests/main_test.py
+++ b/tests/main_test.py
@@ -13,6 +13,7 @@
 import tldextract
 import tldextract.suffix_list
 from tldextract.cache import DiskCache
+from tldextract.remote import inet_pton, looks_like_ip
 from tldextract.suffix_list import SuffixListNotFound
 from tldextract.tldextract import ExtractResult

@@ -132,7 +133,21 @@ def test_ip():
     )


-def test_looks_like_ip():
+@pytest.mark.skipif(not inet_pton, reason="inet_pton unavailable")
+def test_looks_like_ip_with_inet_pton():
+    assert looks_like_ip("1.1.1.1", inet_pton) is True
+    assert looks_like_ip("256.256.256.256", inet_pton) is False
+    assert looks_like_ip("1.1.1.1\n", inet_pton) is False
+
+
+def test_looks_like_ip_without_inet_pton():
+    assert looks_like_ip("1.1.1.1", None) is True
+    assert looks_like_ip("256.256.256.256", None) is False
+    assert looks_like_ip("1.1.1.1\n", None) is False
+    assert looks_like_ip("", None) is False
+
+
+def test_similar_to_ip():
     assert_extract("1\xe9", ("", "", "1\xe9", ""))


@@ -328,6 +343,11 @@ def test_ipv4():
         ("", "", "127.0.0.1", ""),
         expected_ip_data="127.0.0.1",
     )
+    assert_extract(
+        "http://127\u30020\uff0e0\uff611/foo/bar",
+        ("", "", "127.0.0.1", ""),
+        expected_ip_data="127.0.0.1",
+    )


 def test_ipv4_bad():
@@ -339,6 +359,12 @@ def test_ipv4_bad():


 def test_ipv4_lookalike():
+    assert_extract(
+        "http://127.0.0/foo/bar", ("", "127.0", "0", ""), expected_ip_data=""
+    )
+    assert_extract(
+        "http://127.0.0.0x1/foo/bar", ("", "127.0.0", "0x1", ""), expected_ip_data=""
+    )
     assert_extract(
         "http://127.0.0.1.9/foo/bar", ("", "127.0.0.1", "9", ""), expected_ip_data=""
     )
diff --git a/tldextract/remote.py b/tldextract/remote.py
index 60b86294..ea11b467 100644
--- a/tldextract/remote.py
+++ b/tldextract/remote.py
@@ -1,9 +1,17 @@
 """tldextract helpers for testing and fetching remote resources."""

+from __future__ import annotations
+
 import re
-import socket
+from collections.abc import Callable
 from urllib.parse import scheme_chars

+inet_pton: Callable[[int, str], bytes] | None
+try:
+    from socket import AF_INET, inet_pton  # Availability: Unix, Windows.
+except ImportError:
+    inet_pton = None
+
 IP_RE = re.compile(
     r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)"
     r"{3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
@@ -44,18 +52,23 @@ def _schemeless_url(url: str) -> str:
     return url[double_slashes_start + 2 :]


-def looks_like_ip(maybe_ip: str) -> bool:
-    """Check whether the given str looks like an IP address."""
-    if not maybe_ip[0].isdigit():
+def looks_like_ip(
+    maybe_ip: str, pton: Callable[[int, str], bytes] | None = inet_pton
+) -> bool:
+    """Check whether the given str looks like an IPv4 address.
+
+    ``pton`` is tried first when it is not None; pass None to force the
+    regex-only check. An empty ``maybe_ip`` is never an IP.
+    """
+    if not maybe_ip[:1].isdigit():
         return False

-    try:
-        socket.inet_aton(maybe_ip)
-        return True
-    except (AttributeError, UnicodeError):
-        if IP_RE.match(maybe_ip):
+    if pton is not None:
+        try:
+            pton(AF_INET, maybe_ip)
             return True
-    except OSError:
-        pass
-
-    return False
+        except (OSError, ValueError):
+            # ValueError covers embedded NULs and lone surrogates (UnicodeError).
+            return False
+    # fullmatch: a bare `$` would also accept a trailing newline.
+    return IP_RE.fullmatch(maybe_ip) is not None
diff --git a/tldextract/tldextract.py b/tldextract/tldextract.py
index 41ab2290..6dafcb17 100644
--- a/tldextract/tldextract.py
+++ b/tldextract/tldextract.py
@@ -251,19 +251,19 @@ def extract_urllib(
     def _extract_netloc(
         self, netloc: str, include_psl_private_domains: bool | None
     ) -> ExtractResult:
-        labels = (
+        netloc_with_ascii_dots = (
             netloc.replace("\u3002", "\u002e")
             .replace("\uff0e", "\u002e")
             .replace("\uff61", "\u002e")
-            .split(".")
         )
+        labels = netloc_with_ascii_dots.split(".")

         suffix_index = self._get_tld_extractor().suffix_index(
             labels, include_psl_private_domains=include_psl_private_domains
         )

-        if suffix_index == len(labels) and netloc and looks_like_ip(netloc):
-            return ExtractResult("", netloc, "")
+        if suffix_index == len(labels) == 4 and looks_like_ip(netloc_with_ascii_dots):
+            return ExtractResult("", netloc_with_ascii_dots, "")

         suffix = ".".join(labels[suffix_index:]) if suffix_index != len(labels) else ""
         subdomain = ".".join(labels[: suffix_index - 1]) if suffix_index >= 2 else ""