import re

import requests
from bs4 import BeautifulSoup

base_url = 'http://www.pro-football-reference.com'
COL_NAMES = ['week', 'day', 'date', 'boxscore_url', 'result', 'OT', 'record',
             'at', 'opponent', 'team_score', 'opp_score', 'off_first_downs',
             'off_total_yds', 'off_pass_yds', 'off_rush_yds',
             'off_turn_overs', 'def_first_downs', 'def_total_yds',
             'def_pass_yds', 'def_rush_yds', 'def_turn_overs']
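# COL_NAMES covers the 21 columns of a pre-1994 gamelog row; for 1994 and
# later seasons the table carries 3 extra cells, which the zip() in
# parse_season_log simply drops when a row is converted to a dict.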


def _strip_html(text):
    """
    Strips HTML tags from a string.

    :param text: The text to strip tags from
    :return: The cleaned text
    """
    tag_re = re.compile(r'<[^>]+>')
    return tag_re.sub('', str(text))
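

# A quick sketch of _strip_html's expected behavior (the markup below is a
# made-up example, not taken from a real PFR page):
#   _strip_html('<td class="right">24</td>')  # -> '24'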


def parse_boxscore_url(url_tag):
    """
    Parses the URL out of the boxscore cell's anchor tag.

    :param url_tag: The HTML of the boxscore cell
    :return: The boxscore URL
    """
    soup = BeautifulSoup(url_tag, 'html.parser')
    return soup.find_all('a', href=True)[0]['href']
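

# For example, given a boxscore cell like (hypothetical markup)
# '<td><a href="/boxscores/201409040sea.htm">boxscore</a></td>',
# parse_boxscore_url returns '/boxscores/201409040sea.htm'. The href is
# site-relative, so join it with base_url before requesting it.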


def parse_season_log(team, verbose_name, year, csv=False):
    """
    Parses a team's season game log.

    :param team: The team abbreviation
    :param verbose_name: The full name of the team
    :param year: The year to fetch
    :param csv: If True, return each row as a CSV string; otherwise as a dict
    :return: List of game rows (CSV strings or dicts)
    """
    season_url = '%s/teams/%s/%s.htm' % (base_url, team, year)
    res = requests.get(season_url)
    # PFR redirects to a 404 page when the team/year is not found,
    # but still serves a 200 response, so check the final URL instead
    if '404' in res.url:
        raise Exception('No data found for team %s in year %s' % (team, year))
    soup = BeautifulSoup(res.text, 'html.parser')
    parsed = soup.find_all(
        'div', {
            'class': 'table_container',
            'id': 'div_team_gamelogs'
        }
    )
    rows = parsed[0].find_all('td')
    # From 1994 on, the table has 3 extra columns
    column_len = 24 if int(year) >= 1994 else 21
    # Group the flat list of cells into one list per game row
    grouped_rows = [
        rows[i:i + column_len] for i in range(0, len(rows), column_len)
    ]
    data = []
    for row in grouped_rows:
        # If there is no day of week then it was the bye week
        if _strip_html(row[1]) == '':
            continue
        # Pull the href out of the boxscore cell before tags are stripped
        box_score_uri = parse_boxscore_url(
            str(row[COL_NAMES.index('boxscore_url')]))
        row = [_strip_html(x) for x in row]
        if csv:
            # Prepend the year and team name to each CSV row
            row.insert(0, str(year))
            row.insert(1, verbose_name)
            # The two inserted columns shift the boxscore column by 2
            row[COL_NAMES.index('boxscore_url') + 2] = box_score_uri
            data.append(','.join(row))
        else:
            row_dict = dict(zip(COL_NAMES, row))
            row_dict['boxscore'] = box_score_uri
            data.append(row_dict)
    return data
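

# A minimal usage sketch, assuming PFR has data for this (illustrative)
# team/year pair; 'nwe' is the kind of lowercase abbreviation PFR uses:
#   games = parse_season_log('nwe', 'New England Patriots', 2014)
#   games[0]['opponent'], games[0]['result']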


def parse_log_table(html, div_id):
    """
    Parses a log table.

    :param html: The HTML to parse
    :param div_id: The id of the table div
    :return: A dict mapping table labels to values
    """
    soup = BeautifulSoup(html, 'html.parser')
    parsed = soup.find_all(
        'div', {
            'class': 'table_container',
            'id': div_id
        }
    )
    rows = parsed[0].find_all('td')
    table = [_strip_html(x) for x in rows]
    # The cells come back as a flat list with keys and values in sequence:
    # [key1, val1, key2, val2, ...]. Zipping two references to the same
    # iterator pairs consecutive items, converting that list into a dict.
    return dict(zip(*[iter(table)] * 2))
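

# For example, a flat cell list such as (made-up values)
# ['Won Toss', 'Steelers', 'Roof', 'outdoors'] pairs up into
# {'Won Toss': 'Steelers', 'Roof': 'outdoors'}.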


def parse_game_info_log(gamelog_url):
    """
    Parses the boxscore gamelogs.

    :param gamelog_url: The full gamelog (boxscore) URL
    :return: A dict containing game info and referee info
    """
    res = requests.get(gamelog_url)
    return {
        'game_info': parse_log_table(res.text, 'div_game_info'),
        'ref_info': parse_log_table(res.text, 'div_ref_info')
    }
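

if __name__ == '__main__':
    # A minimal end-to-end sketch, assuming the (illustrative) 'nwe'/2014
    # season exists on PFR and that each game's boxscore URI is
    # site-relative, as parse_boxscore_url returns it.
    season = parse_season_log('nwe', 'New England Patriots', 2014)
    first_game = season[0]
    print('Week %s vs %s: %s' % (first_game['week'],
                                 first_game['opponent'],
                                 first_game['result']))
    # Boxscore URIs are relative, so prefix them with base_url
    info = parse_game_info_log(base_url + first_game['boxscore'])
    print(info['game_info'])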