
Commit

Added pagination parser for Samair/Preempt proxy, more work on ProxyObject #30, some log house keeping
pgaref committed Jul 30, 2017
1 parent 37a844a commit b875087
Showing 8 changed files with 69 additions and 44 deletions.
14 changes: 5 additions & 9 deletions http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -36,19 +36,15 @@ def parse_proxyList(self):
datasets.append(dataset)

for dataset in datasets:
proxy_obj = self.createProxyObject(dataset)
proxy_obj = self.create_proxy_object(dataset)
# Make sure it is a Valid Proxy Address
if UrlParser.valid_ip_port(proxy_obj.getAddress()):
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
curr_proxy_list.append(proxy_obj)
proxy_obj.print_everything()
else:
logger.debug("Address with Invalid format: {}".format(proxy_obj.getAddress()))
# print "{0:<10}: {1}".format(field[0], field[1])
# print "ALL: ", curr_proxy_list

logger.debug("Proxy Invalid: {}".format(dataset))
return curr_proxy_list

def createProxyObject(self, dataset):
def create_proxy_object(self, dataset):
# Check Field[0] for tags and field[1] for values!
ip = ""
port = None
@@ -58,7 +54,7 @@ def createProxyObject(self, dataset):
if field[0] == 'IP Address':
# Make sure it is a Valid IP
ip = field[1].strip() # String strip()
# TODO @pgaref: Duplicate code!!!
# Make sure it is a Valid IP
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
return None
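For context, the validation helpers these parsers share behave roughly as follows (a minimal sketch; the UrlParser import path is assumed from the package layout, and the sample addresses are illustrative only):

from http_request_randomizer.requests.parsers.UrlParser import UrlParser  # assumed path

# create_proxy_object() returns None for malformed rows; valid_ip_port() then guards
# what is appended to curr_proxy_list in parse_proxyList()
print(UrlParser.valid_ip_port("191.252.61.28:80"))  # expected: True
print(UrlParser.valid_ip_port("not-an-address"))    # expected: False
print(UrlParser.valid_ip("191.252.61.28"))          # expected: True
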
14 changes: 6 additions & 8 deletions http_request_randomizer/requests/parsers/ProxyForEuParser.py
@@ -35,17 +35,15 @@ def parse_proxyList(self):
datasets.append(dataset)

for dataset in datasets:

# Avoid Straggler proxies and make sure it is a Valid Proxy Address
proxy_obj = self.createProxyObject(dataset)
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.getAddress()):
proxy_obj = self.create_proxy_object(dataset)
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
curr_proxy_list.append(proxy_obj)
proxy_obj.print_everything()
# print "{0:<10}: {1}".format(field[0], field[1])
# print "ALL: ", curr_proxy_list
else:
logger.debug("Proxy Invalid: {}".format(dataset))
return curr_proxy_list

def createProxyObject(self, dataset):
def create_proxy_object(self, dataset):
ip = ""
port = None
anonymity = AnonymityLevel.UNKNOWN
@@ -55,10 +53,10 @@ def createProxyObject(self, dataset):
# Discard slow proxies! Speed is in KB/s
if field[0] == 'Speed':
if float(field[1]) < self.get_min_bandwidth():
logger.debug("Proxy with low bandwidth: {}".format(float(field[1])))
return None
if field[0] == 'IP':
ip = field[1].strip() # String strip()
# TODO @pgaref : Dupicate code?
# Make sure it is a Valid IP
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
15 changes: 9 additions & 6 deletions http_request_randomizer/requests/parsers/RebroWeeblyParser.py
@@ -31,11 +31,11 @@ def parse_proxyList(self, use_top15k=False):
# Parse Top Proxy List page
for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
# Make sure it is a Valid Proxy Address
if UrlParser.valid_ip_port(row):
proxy_obj = self.createProxyObject(row)
proxy_obj = self.create_proxy_object(row)
if proxy_obj is not None and UrlParser.valid_ip_port(row):
curr_proxy_list.append(proxy_obj)
else:
logger.debug("Address with Invalid format: {}".format(row))
logger.debug("Proxy Invalid: {}".format(row))
# Usually these proxies are stale
if use_top15k:
# Parse 15k Nodes Text file (named *-all-*.txt)
@@ -49,15 +49,18 @@ def parse_proxyList(self, use_top15k=False):
more_content = requests.get(self.get_URl() + self.txt_proxy_path).text
for proxy_address in more_content.split():
if UrlParser.valid_ip_port(proxy_address):
proxy_obj = self.createProxyObject(row)
proxy_obj = self.create_proxy_object(row)
curr_proxy_list.append(proxy_obj)

return curr_proxy_list

def createProxyObject(self, dataset):
def create_proxy_object(self, dataset):
# Provider specific code
dataset = dataset.strip() # String strip()
ip = dataset.split(":")[0]
# Make sure it is a Valid IP
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
return None
port = dataset.split(":")[1]
# TODO: Parse extra tables and combine data - Provider seems to be out-of-date
country = "Unknown"
42 changes: 33 additions & 9 deletions http_request_randomizer/requests/parsers/SamairProxyParser.py
@@ -19,9 +19,11 @@ def __init__(self, id, web_url, timeout=None):
def parse_proxyList(self):
curr_proxy_list = []
# Parse all proxy pages -> format: /list/{num}.htm
# TODO @pgaref: get the pageRange from the 'pagination' table
for page in range(1, 21):
response = requests.get("{0}{num:02d}.htm".format(self.get_URl(), num=page), timeout=self.timeout)
# Get the pageRange from the 'pagination' table
page_set = self.get_pagination_set()
logger.debug("Pages: {}".format(page_set))
for page in page_set:
response = requests.get("{0}{1}".format(self.get_URl(), page), timeout=self.timeout)
if not response.ok:
# Could not parse ANY page - Let user know
if not curr_proxy_list:
@@ -50,20 +52,42 @@ def parse_proxyList(self):
for row in table.find_all("tr")[1:]:
td_row = row.find("td")
# curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
proxy_obj = self.create_proxy_object(row)
# Make sure it is a Valid Proxy Address
if UrlParser.valid_ip_port(td_row.text):
proxy_obj = self.createProxyObject(row)
proxy_obj.print_everything()
if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
curr_proxy_list.append(proxy_obj)
else:
logger.debug("Address with Invalid format: {}".format(td_row.text))
logger.debug("Proxy Invalid: {}".format(td_row.text))
return curr_proxy_list

def createProxyObject(self, row):
def get_pagination_set(self):
response = requests.get(self.get_URl(), timeout=self.timeout)
page_set = set()
# Could not parse pagination page - Let user know
if not response.ok:
logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
return page_set
content = response.content
soup = BeautifulSoup(content, "html.parser")
for ultag in soup.find_all('ul', {'class': 'pagination'}):
for litag in ultag.find_all('li'):
page_ref = litag.a.get('href').decode('utf-8')
# Skip current page '/list'
if page_ref.endswith(('htm', 'html')):
page_set.add(page_ref)
else:
page_set.add("")
return page_set

def create_proxy_object(self, row):
for td_row in row.findAll("td"):
if td_row.attrs['data-label'] == 'IP:port ':
text = td_row.text.strip()
ip = text.split(":")[0]
# Make sure it is a Valid IP
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
return None
port = text.split(":")[1]
elif td_row.attrs['data-label'] == 'Anonymity Type: ':
anonymity = AnonymityLevel(td_row.text.strip())
@@ -73,4 +97,4 @@ def createProxyObject(self, row):

def __str__(self):
return "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
.format(self.url, self.minimum_bandwidth_in_KBs)
.format(self.url, self.minimum_bandwidth_in_KBs)
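
The new get_pagination_set() walks the provider's 'pagination' list and collects the relative page links. A minimal standalone sketch of the same BeautifulSoup approach (the HTML snippet is hypothetical, modeled on the mock added in tests/mocks.py below):

from bs4 import BeautifulSoup

html = ('<ul class="pagination">'
        '<li class="active"><a href="/list/">1</a></li>'
        '<li><a href="02.htm">2</a></li>'
        '<li><a href="03.htm">3</a></li></ul>')

soup = BeautifulSoup(html, "html.parser")
page_set = set()
for ultag in soup.find_all('ul', {'class': 'pagination'}):
    for litag in ultag.find_all('li'):
        page_ref = litag.a.get('href')
        # Relative page links end in htm/html; the current page ('/list/') maps to ""
        if page_ref.endswith(('htm', 'html')):
            page_set.add(page_ref)
        else:
            page_set.add("")
print(page_set)  # e.g. {'', '02.htm', '03.htm'}

Each entry is later appended to the provider base URL in parse_proxyList(), which is why the current page contributes an empty suffix.
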
12 changes: 6 additions & 6 deletions http_request_randomizer/requests/proxy/ProxyObject.py
@@ -21,20 +21,20 @@ def __init__(self, source, ip, port, anonymity_level, country=None, protocols=[]
self.protocols = protocols
self.tunnel = tunnel

def getAddress(self):
def get_address(self):
return "{0}:{1}".format(self.ip, self.port)

def __str__(self):
""" Method is heavily used for Logging - make sure we have a readable output
:return: The address representation of the proxy
"""
return "{0} | {1}".format(self.getAddress(), self.source)
return "{0} | {1}".format(self.get_address(), self.source)

def print_everything(self):
print("Address: {0} | Src: {1} | | Country: {2} | Anonymity: {3} | Protoc: {4} | Tunnel: {5}" \
.format(self.getAddress(), self.source, self.country, self.anonymity_level, self.protocols,
self.tunnel))
def to_str(self):
return "Address: {0} | Src: {1} | | Country: {2} | Anonymity: {3} | Protoc: {4} | Tunnel: {5}"\
.format(self.get_address(), self.source, self.country, self.anonymity_level, self.protocols,
self.tunnel)


class AnonymityLevel(MultiValueEnum):
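For illustration, a hedged sketch of the renamed ProxyObject accessors (the constructor values below are made up for demonstration):

from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel

# Hypothetical proxy entry, for demonstration only
proxy = ProxyObject(source="Example", ip="191.252.61.28", port="80",
                    anonymity_level=AnonymityLevel.UNKNOWN)
print(proxy.get_address())  # 191.252.61.28:80
print(str(proxy))           # 191.252.61.28:80 | Example
print(proxy.to_str())       # full address/source/country/anonymity/protocol/tunnel summary
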
4 changes: 2 additions & 2 deletions http_request_randomizer/requests/proxy/requestProxy.py
@@ -100,7 +100,7 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
headers.update(req_headers)

self.logger.debug("Using proxy: {0}".format(str(self.current_proxy)))
request = requests.request(method, url, proxies={"http": self.current_proxy.getAddress()},
request = requests.request(method, url, proxies={"http": self.current_proxy.get_address()},
headers=headers, data=data, params=params, timeout=req_timeout)
# Avoid HTTP request errors
if request.status_code == 409:
@@ -151,7 +151,7 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
req_proxy = RequestProxy()
print("Initialization took: {0} sec".format((time.time() - start)))
print("Size: {0}".format(len(req_proxy.get_proxy_list())))
print("ALL = {0} ".format(req_proxy.get_proxy_list()))
print("ALL = {0} ".format(map(lambda x: x.get_address(), req_proxy.get_proxy_list())))

test_url = 'http://ipv4.icanhazip.com'

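Note that under Python 3 the map() call above prints a lazy map object rather than the addresses themselves; a hedged alternative for that __main__ snippet, materializing the list first:

# Sketch only: collect the addresses before formatting them
addresses = [proxy.get_address() for proxy in req_proxy.get_proxy_list()]
print("ALL = {0}".format(addresses))
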
4 changes: 4 additions & 0 deletions tests/mocks.py
@@ -142,6 +142,10 @@ def samair_mock(url, request):
\n
</tr>
\n
<div id="navbar">
<ul class="pagination"><li class="active"><a href="/list/">1</a></li><li><a href="02.htm">2</a></li></ul>
</div>
\n
<tr class="anon">
<td data-label="IP:port ">191.252.61.28:80</td>
<td data-label="Anonymity Type: ">high-anonymous</td>
8 changes: 4 additions & 4 deletions tests/test_providers.py
@@ -25,7 +25,7 @@ def test_FreeProxyParser(self):
proxy_list = proxy_provider.parse_proxyList()
proxy_list_addr = []
for proxy in proxy_list:
proxy_list_addr.append(proxy.getAddress())
proxy_list_addr.append(proxy.get_address())
self.assertEqual(proxy_list_addr, free_proxy_expected)

def test_ProxyForEuParser(self):
@@ -34,7 +34,7 @@ def test_ProxyForEuParser(self):
proxy_list = proxy_provider.parse_proxyList()
proxy_list_addr = []
for proxy in proxy_list:
proxy_list_addr.append(proxy.getAddress())
proxy_list_addr.append(proxy.get_address())
self.assertEqual(proxy_list_addr, proxy_for_eu_expected)

def test_RebroWeeblyParser(self):
Expand All @@ -43,7 +43,7 @@ def test_RebroWeeblyParser(self):
proxy_list = proxy_provider.parse_proxyList()
proxy_list_addr = []
for proxy in proxy_list:
proxy_list_addr.append(proxy.getAddress())
proxy_list_addr.append(proxy.get_address())
self.assertEqual(proxy_list_addr, rebro_weebly_expected)

def test_SemairProxyParser(self):
Expand All @@ -52,7 +52,7 @@ def test_SemairProxyParser(self):
proxy_list = proxy_provider.parse_proxyList()
proxy_list_addr = []
for proxy in proxy_list:
proxy_list_addr.append(proxy.getAddress())
proxy_list_addr.append(proxy.get_address())
for item in samair_expected:
self.assertTrue(item in proxy_list_addr)

