Skip to content

Commit

Permalink
Accept only 4 decimal octet IPv4 addresses. Support IPv4 addresses with unicode dots. (#292)
Browse files Browse the repository at this point in the history

- IPv4 addresses with unicode dots are now recognized. Closes #287
- IPv4 addresses must have 4 decimal octets. Closes #290

---------

Co-authored-by: John Kurkowski <[email protected]>
  • Loading branch information
elliotwutingfeng and john-kurkowski authored May 26, 2023
1 parent a871d07 commit 1385434
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 16 deletions.
25 changes: 24 additions & 1 deletion tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import tldextract
import tldextract.suffix_list
from tldextract.cache import DiskCache
from tldextract.remote import inet_pton, looks_like_ip
from tldextract.suffix_list import SuffixListNotFound
from tldextract.tldextract import ExtractResult

Expand Down Expand Up @@ -132,7 +133,18 @@ def test_ip():
)


def test_looks_like_ip():
@pytest.mark.skipif(not inet_pton, reason="inet_pton unavailable")
def test_looks_like_ip_with_inet_pton():
    """Check ``looks_like_ip`` when ``inet_pton`` is passed in explicitly.

    A well-formed dotted quad must be accepted and one whose octets exceed
    255 must be rejected. Skipped when ``inet_pton`` is falsy.
    """
    expectations = (
        ("1.1.1.1", True),
        ("256.256.256.256", False),
    )
    for candidate, expected in expectations:
        assert looks_like_ip(candidate, inet_pton) is expected


def test_looks_like_ip_without_inet_pton():
    """Check ``looks_like_ip`` when ``None`` is passed in place of ``inet_pton``.

    The same inputs as the ``inet_pton`` variant must produce the same
    answers: a valid dotted quad is accepted, out-of-range octets are not.
    """
    expectations = (
        ("1.1.1.1", True),
        ("256.256.256.256", False),
    )
    for candidate, expected in expectations:
        assert looks_like_ip(candidate, None) is expected


def test_similar_to_ip():
assert_extract("1\xe9", ("", "", "1\xe9", ""))


Expand Down Expand Up @@ -328,6 +340,11 @@ def test_ipv4():
("", "", "127.0.0.1", ""),
expected_ip_data="127.0.0.1",
)
assert_extract(
"http://127\u30020\uff0e0\uff611/foo/bar",
("", "", "127.0.0.1", ""),
expected_ip_data="127.0.0.1",
)


def test_ipv4_bad():
Expand All @@ -339,6 +356,12 @@ def test_ipv4_bad():


def test_ipv4_lookalike():
assert_extract(
"http://127.0.0/foo/bar", ("", "127.0", "0", ""), expected_ip_data=""
)
assert_extract(
"http://127.0.0.0x1/foo/bar", ("", "127.0.0", "0x1", ""), expected_ip_data=""
)
assert_extract(
"http://127.0.0.1.9/foo/bar", ("", "127.0.0.1", "9", ""), expected_ip_data=""
)
Expand Down
29 changes: 18 additions & 11 deletions tldextract/remote.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
"""tldextract helpers for testing and fetching remote resources."""

from __future__ import annotations

import re
import socket
from collections.abc import Callable
from urllib.parse import scheme_chars

inet_pton: Callable[[int, str], bytes] | None
try:
from socket import AF_INET, inet_pton # Availability: Unix, Windows.
except ImportError:
inet_pton = None

IP_RE = re.compile(
r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)"
r"{3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
Expand Down Expand Up @@ -44,18 +52,17 @@ def _schemeless_url(url: str) -> str:
return url[double_slashes_start + 2 :]


def looks_like_ip(
    maybe_ip: str, pton: Callable[[int, str], bytes] | None = inet_pton
) -> bool:
    """Check whether the given str looks like an IPv4 address.

    Args:
        maybe_ip: Candidate host string. May be empty.
        pton: Parser called as ``pton(AF_INET, maybe_ip)``; it must raise when
            the string is not a valid address. Defaults to the module-level
            ``inet_pton``. Pass ``None`` to use the ``IP_RE`` pattern instead.

    Returns:
        ``True`` if ``maybe_ip`` is accepted as an IPv4 address, else ``False``.
    """
    # Slice rather than index so that an empty string is rejected instead of
    # raising IndexError.
    if not maybe_ip[:1].isdigit():
        return False

    if pton is not None:
        try:
            pton(AF_INET, maybe_ip)
        except (OSError, ValueError):
            # OSError: the parser rejected the address.
            # ValueError: the str could not be handed to the parser at all
            # (e.g. an embedded NUL character or an unencodable code point).
            return False
        return True

    # fullmatch() rather than match(): a ``$`` anchor would otherwise also
    # accept a trailing newline, which the ``pton`` path rejects.
    return IP_RE.fullmatch(maybe_ip) is not None
8 changes: 4 additions & 4 deletions tldextract/tldextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,19 +251,19 @@ def extract_urllib(
def _extract_netloc(
self, netloc: str, include_psl_private_domains: bool | None
) -> ExtractResult:
labels = (
netloc_with_ascii_dots = (
netloc.replace("\u3002", "\u002e")
.replace("\uff0e", "\u002e")
.replace("\uff61", "\u002e")
.split(".")
)
labels = netloc_with_ascii_dots.split(".")

suffix_index = self._get_tld_extractor().suffix_index(
labels, include_psl_private_domains=include_psl_private_domains
)

if suffix_index == len(labels) and netloc and looks_like_ip(netloc):
return ExtractResult("", netloc, "")
if suffix_index == len(labels) == 4 and looks_like_ip(netloc_with_ascii_dots):
return ExtractResult("", netloc_with_ascii_dots, "")

suffix = ".".join(labels[suffix_index:]) if suffix_index != len(labels) else ""
subdomain = ".".join(labels[: suffix_index - 1]) if suffix_index >= 2 else ""
Expand Down

0 comments on commit 1385434

Please sign in to comment.