Add get port functionality (#25)

* Add get port functionality * Fix style in tests * Update behave and share code * Pass extractor to the private function
openwpm · Apr 10, 2020 · 934fcf9 · 934fcf9
1 parent ff5d63a
commit 934fcf9
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 14 deletions.
diff --git a/domain_utils/domain_utils.py b/domain_utils/domain_utils.py
@@ -32,6 +32,29 @@ def is_ip_address(hostname):
         return False
 
 
+def _adapt_url_for_port_and_scheme(url, extractor):
+    # To handle the case where we have no scheme, but we have a port
+    # we have the following heuristic. Does scheme have a . in it
+    # which is stdlib behavior when not recognizing a netloc due to
+    # lack of //. If TLDExtract, can find a suffix in the _scheme
+    # then it's probably a domain without an http.
+
+    purl = urlparse(url)
+    _scheme = purl.scheme
+
+    if '.' in str(_scheme):
+        # From the docs: "urlparse recognizes a netloc only
+        # if it is properly introduced by ‘//’". So we
+        # prepend to get results we expect.
+        if extractor(_scheme).suffix != '' or is_ip_address(_scheme):
+            url = '//{url}'.format(url=url)
+    elif url == purl.path:
+        # this is the case where the url has no scheme
+        # and we are trying to access the root. Ex: localhost:5000
+        url = '//{url}/'.format(url=url)
+    return url
+
+
 @_load_and_update_extractor
 def _get_tld_extract(url, **kwargs):
     extractor = kwargs.get('extractor')
@@ -194,20 +217,8 @@ def get_stripped_url(url, scheme=False, drop_non_http=False, use_netloc=True, ex
         Returns a url stripped to (scheme)?+(netloc|hostname)+(path)?.
         Returns empty string if appropriate.
     """
-    purl = urlparse(url)
-    _scheme = purl.scheme
 
-    # To handle the case where we have no scheme, but we have a port
-    # we have the following heuristic. Does scheme have a . in it
-    # which is stdlib behavior when not recognizing a netloc due to
-    # lack of //. If TLDExtract, can find a suffix in the _scheme
-    # then it's probably a domain without an http.
-    if '.' in _scheme:
-        # From the docs: "urlparse recognizes a netloc only
-        # if it is properly introduced by ‘//’". So we
-        # prepend to get results we expect.
-        if extractor(_scheme).suffix != '' or is_ip_address(_scheme):
-            url = '//{url}'.format(url=url)
+    url = _adapt_url_for_port_and_scheme(url, extractor)
 
     purl = urlparse(url)
     _scheme = purl.scheme
@@ -219,7 +230,6 @@ def get_stripped_url(url, scheme=False, drop_non_http=False, use_netloc=True, ex
         else:
             return url
 
-    purl = urlparse(url)
     scheme_out = ''
     loc_out = ''
     path_out = purl.path
@@ -264,3 +274,23 @@ def get_scheme(url, no_scheme=NO_SCHEME):
         return scheme
     else:
         return no_scheme
+
+
+@_load_and_update_extractor
+def get_port(url, extractor=None):
+    """
+    Given an url, extract from it port if present
+
+    Parameters
+    ----------
+    url: string
+        The URL from where we want to get the scheme
+
+    Returns
+    ----------
+    int
+        Returns port in the url
+    """
+
+    url = _adapt_url_for_port_and_scheme(url, extractor)
+    return urlparse(url).port
diff --git a/tests/test_get_port.py b/tests/test_get_port.py
@@ -0,0 +1,29 @@
+from domain_utils import get_port
+
+
+def test_no_port():
+    assert get_port('domain.net') is None
+
+
+def test_port_in_ip():
+    assert get_port('10.0.0.1:80/path/to/index.html') == 80
+
+
+def test_port_in_url():
+    assert get_port('example.com:80/path/to/index.html') == 80
+
+
+def test_port_in_domain():
+    assert get_port('example.com:5000') == 5000
+
+
+def test_port_in_ip_no_path():
+    assert get_port('10.0.0.1:80') == 80
+
+
+def test_port_no_tld():
+    assert get_port('localhost:8000') == 8000
+
+
+def test_url_with_protocol():
+    assert get_port('ws://example.com:5000') == 5000