-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
152 lines (131 loc) · 6.83 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 26 09:24:46 2016
@author: Julius
"""
import numpy as np
import pandas as pd
import urllib2
import math
from fastnumbers import isfloat
from collections import OrderedDict
from bs4 import BeautifulSoup
import re
import sys
# Team abbreviations as they appear in the stats URLs; kept index-aligned
# with `teams_full_name` below.
teams_abbr = [
    'atl', 'bkn', 'bos', 'cha', 'chi', 'cle', 'dal', 'den', 'det',
    'gs', 'hou', 'ind', 'lac', 'lal', 'mem', 'mia', 'mil', 'min', 'nop', 'nyk',
    'okc', 'orl', 'phi', 'phx', 'por', 'sac', 'sa', 'tor', 'utah', 'was',
]

# Hyphenated full team names, one per entry of `teams_abbr` (same order).
teams_full_name = [
    'atlanta-hawks', 'brooklyn-nets', 'boston-celtics', 'charlotte-hornets',
    'chicago-bulls', 'cleveland-cavaliers', 'dallas-mavericks', 'denver-nuggets',
    'detroit-pistons', 'golden-state-warriors', 'houston-rockets', 'indiana-pacers',
    'los-angeles-clippers', 'los-angeles-lakers', 'memphis-grizzlies', 'miami-heat',
    'milwaukee-bucks', 'minnesota-timberwolves', 'new-orleans-pelicans', 'new-york-knicks',
    'oklahoma-city-thunder', 'orlando-magic', 'philadelphia-76ers', 'phoenix-suns',
    'portland-trail-blazers', 'sacramento-kings', 'san-antonio-spurs', 'toronto-raptors',
    'utah-jazz', 'washington-wizards',
]

# abbreviation -> full team name
team_abbr_full = {abbr: full_name for abbr, full_name in zip(teams_abbr, teams_full_name)}

# Seasons covered by the champions table (1991 .. 2015 inclusive).
champion_years = np.arange(1991, 2016)

# season year -> abbreviation of that season's champion team
champions = dict(zip(champion_years, [
    'chi', 'chi', 'chi', 'hou', 'hou', 'chi', 'chi', 'chi', 'sa', 'lal', 'lal', 'lal',
    'sa', 'det', 'sa', 'mia', 'sa', 'bos', 'lal', 'lal', 'dal', 'mia', 'mia', 'sa', 'gs',
]))
def crawl_nbastats_champ_by_years(years, num_player_of_interest=10):
    """Crawl and save the champion team's normalised stats for each year.

    For every year in `years` the champion is looked up in `champions`, its
    stats are fetched through `crawl_nbastats_by_year`, and the first
    `num_player_of_interest` player rows are written to
    '<year>_<team-full-name>.csv' (a relative path). Nothing is saved for a
    year whose champion table has fewer rows than that; a warning is printed
    instead.

    Parameters
    ----------
    years : iterable
        Season years; each must be a key of `champions`, otherwise the
        lookup raises KeyError.
    num_player_of_interest : int
        Number of player rows to keep per team. Forwarded to
        `crawl_nbastats_by_year` so both functions apply the same cut-off.
    """
    for year in years:
        champion_abbr = champions[year]
        champion_name = team_abbr_full[champion_abbr]
        champion_teamstats = crawl_nbastats_by_year(
            year=year,
            champion_team_name=champion_abbr,
            num_player_of_interest=num_player_of_interest)
        print('---------' + str(year) + ' Champ: ' + champion_name + '---------')
        # keep only the leading rows of players for the factor analysis done later
        if champion_teamstats.shape[0] >= num_player_of_interest:
            champion_teamstats = champion_teamstats.iloc[0:num_player_of_interest]
            filename = str(year) + '_' + champion_name + '.csv'
            champion_teamstats.to_csv(filename)
        else:
            print('warning (stats not saved): # of players is less than '
                  + str(num_player_of_interest) + " on year's champ team!")
def _fetch_team_page(url):
    """Download `url` and return the response body, or None if redirected."""
    # urllib2 is the module imported at the top of this file.
    response = urllib2.urlopen(url)
    try:
        if response.url != url:
            # ' '.join reproduces the separator the two-argument print
            # statement used to insert between message and address.
            print(' '.join(['no response on this address, redirect to: ', response.url]))
            return None
        return response.read()
    finally:
        # always release the connection, including on the redirect path
        response.close()


def _parse_team_stats(html, num_player_of_interest):
    """Parse one team stats page into a players x statistics DataFrame.

    Returns None (after printing a warning) when the page lists fewer than
    `num_player_of_interest` players.
    """
    soup = BeautifulSoup(html, 'html.parser')
    players = soup.findAll('tr', {'class': re.compile('^player-')})
    header_rows = soup.findAll('tr', {'class': ['colhead']})
    print(soup.title.string)

    # Header cells of the two tables on the page
    # (table 1: game statistics, table 2: shooting statistics).
    stat_labels = [header_rows[0].select('td'), header_rows[1].select('td')]

    # OrderedDict keeps the column order of the page and collapses the
    # label that both tables share ('PLAYER') into a single key.
    column_names = OrderedDict()
    for labels in stat_labels:
        for label in labels:
            column_names[label.get_text()] = 0.0

    # every player has one row in each of the two tables
    num_players = len(players) // 2
    # teams with too few players are not included in the study
    if num_players < num_player_of_interest:
        print('warning: players less than ' + str(num_player_of_interest) + ' !')
        return None

    # The first `num_players` rows belong to table 1; their first cell is
    # the player name.
    player_names = [row.findAll('td')[0].get_text().encode('ascii', 'ignore')
                    for row in players[:num_players]]

    team_stats = pd.DataFrame(np.zeros([num_players, len(column_names)]),
                              index=player_names,
                              columns=list(column_names.keys()))
    team_stats = team_stats.drop('PLAYER', axis=1)

    for i, row in enumerate(players):
        player_idx = i % num_players   # which player this row describes
        table_idx = i // num_players   # which of the two tables the row is from
        for label, cell in zip(stat_labels[table_idx], row.findAll('td')):
            text = cell.get_text().encode('ascii', 'ignore')
            # non-numeric cells (e.g. the player name) are skipped
            if isfloat(text):
                team_stats.at[player_names[player_idx], label.get_text()] = float(text)
    return team_stats


def crawl_nbastats_by_year(year, champion_team_name='dal', num_player_of_interest=10):
    """Crawl every team's player stats for `year` and normalise them.

    Each team in `teams_abbr` / `teams_full_name` is fetched from the ESPN
    team stats URL (players ordered by average minutes). Teams whose request
    is redirected, or that list fewer than `num_player_of_interest` players,
    are skipped. Each statistic is then divided by the best (maximum) value
    of that statistic over all teams kept.

    Side effects: prints progress messages, and writes the first
    `num_player_of_interest` rows of every non-champion team to
    'non_champions/<year>_<team-full-name>.csv' (the directory is created
    if it does not exist).

    Parameters
    ----------
    year : int
        Season year inserted into the URL.
    champion_team_name : str
        Abbreviation (a key of `team_abbr_full`) of that year's champion.
    num_player_of_interest : int
        Minimum roster size for a team to be used, and the number of rows
        saved per non-champion team.

    Returns
    -------
    pandas.DataFrame
        The champion's normalised stats (all player rows). The frame is
        empty if the champion was skipped, and it is left un-normalised if
        no best-stat vector could be built.
    """
    import os  # local import: only needed to prepare the output directory

    champion_team_name = team_abbr_full[champion_team_name]
    url_root = 'http://espn.go.com/nba/team/stats/_/name/'
    category = '/cat/avgMinutes/'  # ordering players by their avg. minutes
    best_stats = []
    champion_teamstats = pd.DataFrame()
    non_champion_teamstats = []
    non_champion_teamlist = []

    for team_abbr, team_name in zip(teams_abbr, teams_full_name):
        url = url_root + team_abbr + '/year/' + str(year) + category + team_name
        print('parsing ' + url + ' ...')
        html = _fetch_team_page(url)
        if html is None:
            continue
        team_stats = _parse_team_stats(html, num_player_of_interest)
        if team_stats is None:
            continue

        team_stats.index.name = 'Players'
        team_stats.columns.name = 'Statistics'

        # keep track of the champion team of the specified year
        if team_name == champion_team_name:
            champion_teamstats = team_stats
        else:
            non_champion_teamstats.append(team_stats)
            non_champion_teamlist.append(team_name)

        # keep track of the best value of each statistic
        team_best = team_stats.max(axis=0, numeric_only=True).values
        if len(best_stats) == 0:
            best_stats = team_best
        elif not np.isnan(team_best).any():
            # only take the max if all of this team's entries are non-nan
            best_stats = np.maximum(best_stats, team_best)  # element-wise max

    # normalise the stats by dividing each team's stats by the best stats among all teams
    if (not champion_teamstats.empty) and (len(best_stats) != 0):
        champion_teamstats = champion_teamstats.loc[:, 'GP':].divide(best_stats, axis='columns')
        if non_champion_teamstats and not os.path.isdir('non_champions'):
            os.makedirs('non_champions')
        for team_stat, team_name in zip(non_champion_teamstats, non_champion_teamlist):
            team_stat = team_stat.loc[:, 'GP':].divide(best_stats, axis='columns')
            team_stat = team_stat.iloc[0:num_player_of_interest]
            filename = 'non_champions/' + str(year) + '_' + team_name + '.csv'
            team_stat.to_csv(filename)
    return champion_teamstats