Skip to content

Commit

Permalink
Add get port functionality (#25)
Browse files Browse the repository at this point in the history
* Add get port functionality

* Fix style in tests

* Update behave and share code

* Pass extractor to the private function
  • Loading branch information
yabirgb authored Apr 10, 2020
1 parent ff5d63a commit 934fcf9
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 14 deletions.
58 changes: 44 additions & 14 deletions domain_utils/domain_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,29 @@ def is_ip_address(hostname):
return False


def _adapt_url_for_port_and_scheme(url, extractor):
# To handle the case where we have no scheme, but we have a port
# we have the following heuristic. Does scheme have a . in it
# which is stdlib behavior when not recognizing a netloc due to
# lack of //. If TLDExtract, can find a suffix in the _scheme
# then it's probably a domain without an http.

purl = urlparse(url)
_scheme = purl.scheme

if '.' in str(_scheme):
# From the docs: "urlparse recognizes a netloc only
# if it is properly introduced by ‘//’". So we
# prepend to get results we expect.
if extractor(_scheme).suffix != '' or is_ip_address(_scheme):
url = '//{url}'.format(url=url)
elif url == purl.path:
# this is the case where the url has no scheme
# and we are trying to access the root. Ex: localhost:5000
url = '//{url}/'.format(url=url)
return url


@_load_and_update_extractor
def _get_tld_extract(url, **kwargs):
extractor = kwargs.get('extractor')
Expand Down Expand Up @@ -194,20 +217,8 @@ def get_stripped_url(url, scheme=False, drop_non_http=False, use_netloc=True, ex
Returns a url stripped to (scheme)?+(netloc|hostname)+(path)?.
Returns empty string if appropriate.
"""
purl = urlparse(url)
_scheme = purl.scheme

# To handle the case where we have no scheme, but we have a port
# we have the following heuristic. Does scheme have a . in it
# which is stdlib behavior when not recognizing a netloc due to
# lack of //. If TLDExtract, can find a suffix in the _scheme
# then it's probably a domain without an http.
if '.' in _scheme:
# From the docs: "urlparse recognizes a netloc only
# if it is properly introduced by ‘//’". So we
# prepend to get results we expect.
if extractor(_scheme).suffix != '' or is_ip_address(_scheme):
url = '//{url}'.format(url=url)
url = _adapt_url_for_port_and_scheme(url, extractor)

purl = urlparse(url)
_scheme = purl.scheme
Expand All @@ -219,7 +230,6 @@ def get_stripped_url(url, scheme=False, drop_non_http=False, use_netloc=True, ex
else:
return url

purl = urlparse(url)
scheme_out = ''
loc_out = ''
path_out = purl.path
Expand Down Expand Up @@ -264,3 +274,23 @@ def get_scheme(url, no_scheme=NO_SCHEME):
return scheme
else:
return no_scheme


@_load_and_update_extractor
def get_port(url, extractor=None):
    """
    Return the port found in ``url``, if any.

    The URL is first passed through ``_adapt_url_for_port_and_scheme`` so
    that inputs without a scheme (e.g. ``example.com:5000``) are still
    parsed with a netloc.

    Parameters
    ----------
    url: string
        The URL from which we want to get the port
    extractor: optional
        Forwarded to ``_adapt_url_for_port_and_scheme``.
        NOTE(review): presumably filled in by the
        ``_load_and_update_extractor`` decorator when omitted - confirm.

    Returns
    ----------
    int or None
        The ``port`` attribute of the parsed, adapted URL; ``None`` when
        the URL carries no port.
    """
    adapted_url = _adapt_url_for_port_and_scheme(url, extractor)
    parsed = urlparse(adapted_url)
    return parsed.port
29 changes: 29 additions & 0 deletions tests/test_get_port.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from domain_utils import get_port


def test_no_port():
    """A bare domain without a port yields ``None``."""
    port = get_port('domain.net')
    assert port is None


def test_port_in_ip():
    """A scheme-less IP address with port and path yields the port."""
    port = get_port('10.0.0.1:80/path/to/index.html')
    assert port == 80


def test_port_in_url():
    """A scheme-less domain with port and path yields the port."""
    port = get_port('example.com:80/path/to/index.html')
    assert port == 80


def test_port_in_domain():
    """A scheme-less domain with port and no path yields the port."""
    port = get_port('example.com:5000')
    assert port == 5000


def test_port_in_ip_no_path():
    """A scheme-less IP address with port and no path yields the port."""
    port = get_port('10.0.0.1:80')
    assert port == 80


def test_port_no_tld():
    """A host name without a TLD (``localhost``) still yields its port."""
    port = get_port('localhost:8000')
    assert port == 8000


def test_url_with_protocol():
    """A URL that already carries a scheme yields its port."""
    port = get_port('ws://example.com:5000')
    assert port == 5000

0 comments on commit 934fcf9

Please sign in to comment.