eacu.py
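# Scrape EACU county chess league results for the 2014-15 season from
# c4results.org.uk: walk the site index to the 'EACU County' pages, then load
# and print the league table for each team listed in eacu_teams.
# Written for Python 2 (urllib2/cookielib) with BeautifulSoup 3; print_headers()
# and parse() come from the local 'parse' module.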
import urllib2, HTMLParser, cookielib
from BeautifulSoup import BeautifulSoup
from parse import parse, print_headers
#import requests
site = "http://www.c4results.org.uk"
siteurl = "http://www.c4results.org.uk/chess/php/"
top_url = "http://www.c4results.org.uk/chess/php/index.php"
dbversion = { 'DBVersion': '2014-15' }
eacu_teams = ['1st Team', 'U160']
class SiteAccessor(object):
    def __init__(self):
        self.cookies = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookies))

    def follow_url(self, url):
        self.opener.open(url)

    def load_url(self, base):
        if base.find('?') >= 0:
            url = base + '&'
        else:
            url = base + '?'
        for p in dbversion:
            url = url + p + '=' + dbversion[p]
        webpage = self.opener.open(url)
        top_page = BeautifulSoup(webpage.read())
        return top_page
accessor = SiteAccessor()
#def start_new():
#    resp = requests.get(thisurl, params=dbversion)
#    print resp.cookies
#    top_page = BeautifulSoup(resp.text)
#    return top_page
top_page = accessor.load_url(top_url)
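# Scan the index page for rows whose first column starts with 'EACU', then
# drill down to the 'EACU County' pages and the individual team league tables.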
for league in top_page.findAll('tr'):
    columns = league.findAll('td')
    if len(columns) > 0 and columns[0].contents[0].startswith('EACU'):
        #for bit in columns:
        #    print bit.contents
        #print
        # Follow the link in the second column of the matching row.
        href = columns[1].contents[0]['href']
        #print 'First Href', href
        #page = BeautifulSoup(urllib2.urlopen(site + href).read())
        page = accessor.load_url(site + href)
        rows = page.findAll('tr')
        for row in rows:
            cells = row.findAll('td')
            if len(cells) > 0:
                anchors = cells[0].findAll('a')
                if len(anchors) > 0:
                    if anchors[0].contents[0].startswith('EACU County'):
                        # Open the linked 'EACU County' page.
                        href = anchors[0]['href']
                        #print 'Second Href', href
                        eacupage = accessor.load_url(siteurl + href)
                        headers = True
                        for link in eacupage.findAll('a'):
                            for team in eacu_teams:
                                if link.contents[0].endswith(team):
                                    # Load the first table on the team's league page;
                                    # print the column headings only once.
                                    league_page = accessor.load_url(site + link['href']).findAll('table')[0]
                                    if headers:
                                        print_headers(league_page)
                                        headers = False
                                    parse(team, league_page)
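
The print_headers() and parse() functions are imported from a local parse module that is not included in this file. As a rough guide only, here is a minimal, hypothetical stand-in consistent with how eacu.py calls them: both receive a BeautifulSoup table element, print_headers() prints the column headings once, and parse() prints each result row prefixed with the team name. The real module may format its output quite differently.

def cell_text(cell):
    # Flatten a table cell to plain text (BeautifulSoup 3 style).
    return ''.join(cell.findAll(text=True)).strip()

def print_headers(table):
    # Print the first row of the league table as column headings.
    first_row = table.findAll('tr')[0]
    print ' | '.join(cell_text(c) for c in first_row.findAll(['th', 'td']))

def parse(team, table):
    # Print every remaining row, prefixed with the team name.
    for row in table.findAll('tr')[1:]:
        cells = [cell_text(c) for c in row.findAll('td')]
        if cells:
            print team, '|', ' | '.join(cells)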