Merge branch 'release/v1.1.0'
pgaref committed Jul 30, 2017
2 parents ca36467 + 7bb0405 commit 3d3e37a
Showing 13 changed files with 402 additions and 107 deletions.
21 changes: 21 additions & 0 deletions .gitignore
@@ -1 +1,22 @@
*.pyc
### JetBrains ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/workspace.xml
.idea/tasks.xml
.idea/dictionaries
.idea/vcs.xml
.idea/jsLibraryMappings.xml

# Sensitive or high-churn files:
.idea/dataSources.ids
.idea/dataSources.xml
.idea/dataSources.local.xml
.idea/sqlDataSources.xml
.idea/dynamic.xml
.idea/uiDesigner.xml

## File-based project format:
*.iws
53 changes: 31 additions & 22 deletions http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -4,21 +4,22 @@
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.UrlParser import UrlParser
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel

logger = logging.getLogger(__name__)
__author__ = 'pgaref'


class FreeProxyParser(UrlParser):
def __init__(self, web_url, timeout=None):
UrlParser.__init__(self, web_url, timeout)
def __init__(self, id, web_url, timeout=None):
UrlParser.__init__(self, id, web_url, timeout)

def parse_proxyList(self):
curr_proxy_list = []
response = requests.get(self.get_URl(), timeout=self.timeout)
response = requests.get(self.get_url(), timeout=self.timeout)

if not response.ok:
logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
return []

content = response.content
@@ -35,28 +36,36 @@ def parse_proxyList(self):
datasets.append(dataset)

for dataset in datasets:
# Check Field[0] for tags and field[1] for values!
address = ""
for field in dataset:
if field[0] == 'IP Address':
# Make sure it is a Valid IP
if not UrlParser.valid_ip(field[1]):
logger.debug("IP with Invalid format: {}".format(field[1]))
break
else:
address += field[1] + ':'
elif field[0] == 'Port':
address += field[1]
proxy_obj = self.create_proxy_object(dataset)
# Make sure it is a Valid Proxy Address
if UrlParser.valid_ip_port(address):
proxy = "http://" + address
curr_proxy_list.append(proxy.__str__())
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
curr_proxy_list.append(proxy_obj)
else:
logger.debug("Address with Invalid format: {}".format(address))
# print "{0:<10}: {1}".format(field[0], field[1])
# print "ALL: ", curr_proxy_list
logger.debug("Proxy Invalid: {}".format(dataset))
return curr_proxy_list

def create_proxy_object(self, dataset):
# Check Field[0] for tags and field[1] for values!
ip = ""
port = None
anonymity = AnonymityLevel.UNKNOWN
country = None
for field in dataset:
if field[0] == 'IP Address':
# Make sure it is a Valid IP
ip = field[1].strip() # String strip()
# Make sure it is a Valid IP
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
return None
elif field[0] == 'Port':
port = field[1].strip() # String strip()
elif field[0] == 'Anonymity':
anonymity = AnonymityLevel.get(field[1].strip()) # String strip()
elif field[0] == 'Country':
country = field[1].strip() # String strip()
return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)

def __str__(self):
return "FreeProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
.format(self.url, self.minimum_bandwidth_in_KBs)
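With this change parse_proxyList() returns ProxyObject instances (carrying source id, IP, port, anonymity level and country) instead of plain "http://ip:port" strings. A minimal usage sketch, assuming only the constructor signature and get_address() accessor visible in this diff; the 'freeproxy' id and provider URL are illustrative placeholders:

from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser

# id and URL are placeholder values, not taken from this commit
parser = FreeProxyParser('freeproxy', 'http://free-proxy-list.net', timeout=5)
for proxy in parser.parse_proxyList():
    # each entry is now a ProxyObject rather than a bare string
    print(proxy.get_address())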
62 changes: 36 additions & 26 deletions http_request_randomizer/requests/parsers/ProxyForEuParser.py
@@ -4,21 +4,22 @@
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.UrlParser import UrlParser
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel

logger = logging.getLogger(__name__)
__author__ = 'pgaref'


class ProxyForEuParser(UrlParser):
def __init__(self, web_url, bandwithdh=None, timeout=None):
UrlParser.__init__(self, web_url, bandwithdh, timeout)
def __init__(self, id, web_url, bandwithdh=None, timeout=None):
UrlParser.__init__(self, id, web_url, bandwithdh, timeout)

def parse_proxyList(self):
curr_proxy_list = []
response = requests.get(self.get_URl(), timeout=self.timeout)
response = requests.get(self.get_url(), timeout=self.timeout)

if not response.ok:
logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
return []

content = response.content
@@ -34,31 +35,40 @@ def parse_proxyList(self):
datasets.append(dataset)

for dataset in datasets:
# Check Field[0] for tags and field[1] for values!
address = ""
proxy_straggler = False
for field in dataset:
# Discard slow proxies! Speed is in KB/s
if field[0] == 'Speed':
if float(field[1]) < self.get_min_bandwidth():
proxy_straggler = True
if field[0] == 'IP':
# Make sure it is a Valid IP
if not UrlParser.valid_ip(field[1]):
logger.debug("IP with Invalid format: {}".format(field[1]))
break
else:
address += field[1] + ':'
elif field[0] == 'Port':
address += field[1]
# Avoid Straggler proxies and make sure it is a Valid Proxy Address
if not proxy_straggler and UrlParser.valid_ip_port(address):
proxy = "http://" + address
curr_proxy_list.append(proxy.__str__())
# print "{0:<10}: {1}".format(field[0], field[1])
# print "ALL: ", curr_proxy_list
proxy_obj = self.create_proxy_object(dataset)
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
curr_proxy_list.append(proxy_obj)
else:
logger.debug("Proxy Invalid: {}".format(dataset))
return curr_proxy_list

def create_proxy_object(self, dataset):
ip = ""
port = None
anonymity = AnonymityLevel.UNKNOWN
country = None
# Check Field[0] for tags and field[1] for values!
for field in dataset:
# Discard slow proxies! Speed is in KB/s
if field[0] == 'Speed':
if float(field[1]) < self.get_min_bandwidth():
logger.debug("Proxy with low bandwidth: {}".format(float(field[1])))
return None
if field[0] == 'IP':
ip = field[1].strip() # String strip()
# Make sure it is a Valid IP
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
return None
elif field[0] == 'Port':
port = field[1].strip() # String strip()
elif field[0] == 'Anon':
anonymity = AnonymityLevel.get(field[1].strip()) # String strip()
elif field[0] == 'Country':
country = field[1].strip() # String strip()
return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)

def __str__(self):
return "ProxyForEU Parser of '{0}' with required bandwidth: '{1}' KBs" \
.format(self.url, self.minimum_bandwidth_in_KBs)
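ProxyForEuParser gets the same ProxyObject construction plus a bandwidth gate: rows whose Speed column (in KB/s) falls below get_min_bandwidth() are dropped before a ProxyObject is built. A hedged sketch of driving that threshold through the constructor; the id, URL and 1.0 KB/s figure are illustrative:

from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser

# bandwithdh (the parameter name used by this parser) sets the minimum speed in KB/s
parser = ProxyForEuParser('proxyforeu', 'http://proxyfor.eu/geo.php', bandwithdh=1.0, timeout=5)
proxies = parser.parse_proxyList()  # rows slower than 1.0 KB/s never reach this list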
63 changes: 48 additions & 15 deletions http_request_randomizer/requests/parsers/RebroWeeblyParser.py
@@ -4,54 +4,87 @@
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.UrlParser import UrlParser
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel

logger = logging.getLogger(__name__)
__author__ = 'pgaref'


class RebroWeeblyParser(UrlParser):
def __init__(self, web_url, timeout=None):
def __init__(self, id, web_url, timeout=None):
self.top_proxy_path = "proxy-list.html"
self.txt_proxy_path = "txt-lists.html"
UrlParser.__init__(self, web_url, timeout)
UrlParser.__init__(self, id, web_url, timeout)

def parse_proxyList(self, use_top15k=False):
curr_proxy_list = []
response = requests.get(self.get_URl()+"/"+self.top_proxy_path, timeout=self.timeout)
response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)

if not response.ok:
logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
return []

content = response.content
soup = BeautifulSoup(content, "html.parser")
table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"}).find('font', attrs={
'color': '#33a27f'})
all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
# address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
# .find('font', attrs={'color': '#33a27f'})
# Parse Top Proxy List page
for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
address_list = []
country_list = []
anonymity_list = []
for div in all_divs:
address_div = div.find('font', attrs={'color': '#33a27f'})
if address_div is not None:
for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
address_list.append(str(row))
curr_div = div.findAll('font', attrs={'size': '2'})
if curr_div[0] is not None:
row_data = []
# font -> strong -> font
title = curr_div[0].contents[0].contents[0].contents[0]
for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
row_data.append(str(row))
if 'Country' in str(title):
country_list.extend(row_data)
if 'Status' in str(title):
anonymity_list.extend(row_data)
for address, country, anonymity in zip(address_list, country_list, anonymity_list):
# Make sure it is a Valid Proxy Address
if UrlParser.valid_ip_port(row):
proxy = "http://" + row
curr_proxy_list.append(proxy.__str__())
proxy_obj = self.create_proxy_object(address, country, anonymity)
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
curr_proxy_list.append(proxy_obj)
else:
logger.debug("Address with Invalid format: {}".format(row))
logger.debug("Proxy Invalid: {}".format(row))
# Usually these proxies are stale
if use_top15k:
# Parse 15k Nodes Text file (named *-all-*.txt)
content = requests.get(self.get_URl() + "/" + self.txt_proxy_path).content
content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
soup = BeautifulSoup(content, "html.parser")
table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
for link in table.findAll('a'):
current_link = link.get('href')
if current_link is not None and "all" in current_link:
self.txt_proxy_path = current_link
more_content = requests.get(self.get_URl()+self.txt_proxy_path).text
more_content = requests.get(self.get_url() + self.txt_proxy_path).text
for proxy_address in more_content.split():
if UrlParser.valid_ip_port(proxy_address):
curr_proxy_list.append(proxy_address)

proxy_obj = self.create_proxy_object(row)
curr_proxy_list.append(proxy_obj)
return curr_proxy_list

def create_proxy_object(self, address, country, anonymity):
# Make sure it is a Valid IP
ip = address.strip().split(":")[0]
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
return None
port = address.strip().split(":")[1]
country = country.strip()
anonymity = AnonymityLevel.get(anonymity.strip())

return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)

def __str__(self):
return "RebroWeebly Parser of '{0}' with required bandwidth: '{1}' KBs" \
.format(self.url, self.minimum_bandwidth_in_KBs)
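RebroWeeblyParser now collects addresses, countries and anonymity status from three separate font blocks and zips the parallel lists together before building each ProxyObject. A self-contained sketch of that pairing and of the ip:port split done in create_proxy_object, using made-up rows:

# made-up stand-ins for the three parallel lists the parser collects
address_list = ['10.0.0.1:8080', '10.0.0.2:3128']
country_list = ['Germany', 'France']
anonymity_list = ['Elite', 'Transparent']

for address, country, anonymity in zip(address_list, country_list, anonymity_list):
    ip, port = address.strip().split(':')  # same split create_proxy_object performs
    print("{0}:{1} {2} {3}".format(ip, port, country.strip(), anonymity.strip()))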
59 changes: 49 additions & 10 deletions http_request_randomizer/requests/parsers/SamairProxyParser.py
@@ -4,27 +4,30 @@
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.UrlParser import UrlParser
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel

logger = logging.getLogger(__name__)
__author__ = 'pgaref'


# Samair Proxy now renamed to: premproxy.com
class SamairProxyParser(UrlParser):
def __init__(self, web_url, timeout=None):
def __init__(self, id, web_url, timeout=None):
web_url += "/list/"
UrlParser.__init__(self, web_url, timeout)
UrlParser.__init__(self, id, web_url, timeout)

def parse_proxyList(self):
curr_proxy_list = []
# Parse all proxy pages -> format: /list/{num}.htm
# TODO: get the pageRange from the 'pagination' table
for page in range(1, 21):
response = requests.get("{0}{num:02d}.htm".format(self.get_URl(), num=page), timeout=self.timeout)
# Get the pageRange from the 'pagination' table
page_set = self.get_pagination_set()
logger.debug("Pages: {}".format(page_set))
for page in page_set:
response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
if not response.ok:
# Could not parse ANY page - Let user know
if not curr_proxy_list:
logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
# Return proxies parsed so far
return curr_proxy_list
content = response.content
@@ -49,13 +52,49 @@ def parse_proxyList(self):
for row in table.find_all("tr")[1:]:
td_row = row.find("td")
# curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
proxy_obj = self.create_proxy_object(row)
# Make sure it is a Valid Proxy Address
if UrlParser.valid_ip_port(td_row.text):
curr_proxy_list.append('http://' + td_row.text)
if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
curr_proxy_list.append(proxy_obj)
else:
logger.debug("Address with Invalid format: {}".format(td_row.text))
logger.debug("Proxy Invalid: {}".format(td_row.text))
return curr_proxy_list

def get_pagination_set(self):
response = requests.get(self.get_url(), timeout=self.timeout)
page_set = set()
# Could not parse pagination page - Let user know
if not response.ok:
logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
return page_set
content = response.content
soup = BeautifulSoup(content, "html.parser")
for ultag in soup.find_all('ul', {'class': 'pagination'}):
for litag in ultag.find_all('li'):
page_ref = litag.a.get('href')
# Skip current page '/list'
if page_ref.endswith(('htm', 'html')):
page_set.add(page_ref)
else:
page_set.add("")
return page_set

def create_proxy_object(self, row):
for td_row in row.findAll("td"):
if td_row.attrs['data-label'] == 'IP:port ':
text = td_row.text.strip()
ip = text.split(":")[0]
# Make sure it is a Valid IP
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
return None
port = text.split(":")[1]
elif td_row.attrs['data-label'] == 'Anonymity Type: ':
anonymity = AnonymityLevel.get(td_row.text.strip())
elif td_row.attrs['data-label'] == 'Country: ':
country = td_row.text.strip()
return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)

def __str__(self):
return "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
.format(self.url, self.minimum_bandwidth_in_KBs)
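SamairProxyParser (premproxy.com) replaces the hard-coded 20-page loop with get_pagination_set(), which collects the page suffixes linked from the ul.pagination element. A self-contained illustration of that extraction against a made-up markup snippet:

from bs4 import BeautifulSoup

html = ('<ul class="pagination">'
        '<li><a href="/list">1</a></li>'
        '<li><a href="02.htm">2</a></li>'
        '<li><a href="03.htm">3</a></li></ul>')
page_set = set()
for ultag in BeautifulSoup(html, 'html.parser').find_all('ul', {'class': 'pagination'}):
    for litag in ultag.find_all('li'):
        page_ref = litag.a.get('href')
        # real page suffixes are kept; the bare '/list' link maps to "" (the current page)
        page_set.add(page_ref if page_ref.endswith(('htm', 'html')) else "")
print(sorted(page_set))  # ['', '02.htm', '03.htm']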