-
Notifications
You must be signed in to change notification settings - Fork 0
/
Plots.py
147 lines (114 loc) · 4.7 KB
/
Plots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import math
import datetime
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.svm import SVR
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
import numpy as np
from ParseData.Database import DB
from Lib import loadData, getPointsOfInterest, generateAllFeatures, generateAllFeaturesExceptWeather
def plot(generateX, xLabel='x', yLabel='Taxi Pickups', includeFunc=None):
    """Scatter-plot one generated feature against the target for every POI.

    A single figure is created with one subplot per entry returned by
    ``getPointsOfInterest()``. For each entry the ``(x, y)`` pair returned by
    ``loadData`` is drawn as a scatter plot titled with the entry's ``NAME``.

    Args:
        generateX: Callable forwarded to ``loadData`` to produce the x values.
            The generate* functions in this file take
            ``(db, latitude, longitude, time)``.
        xLabel: Label put on the x axis of every subplot.
        yLabel: Label put on the y axis of every subplot.
        includeFunc: Forwarded unchanged to ``loadData`` as its ``includeFunc``
            keyword (presumably a row filter such as the include* factories
            in this file -- confirm against ``loadData``).
    """
    with DB() as db:
        POIs = getPointsOfInterest()
        numPOIs = len(POIs)

        # Size the grid so that numRows * numCols >= numPOIs for every count.
        # The previous int(sqrt(n)) x (int(sqrt(n)) + 1) layout was too small
        # for e.g. 3 or 8 POIs, which made plt.subplot() raise.
        numCols = max(1, int(math.ceil(math.sqrt(numPOIs))))
        numRows = max(1, int(math.ceil(numPOIs / float(numCols))))

        plt.figure()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)

        # Subplot indices are 1-based.
        for fignum, POI in enumerate(POIs, 1):
            # Single-argument form prints the same text on Python 2 and 3.
            print('POI %s' % (POI,))
            x, y = loadData(db, POI['LAT'], POI['LONG'], generateX, includeFunc=includeFunc)

            plt.subplot(numRows, numCols, fignum)
            plt.scatter(x, y)
            plt.title(POI['NAME'])
            plt.xlabel(xLabel)
            plt.ylabel(yLabel)

        plt.show()
# Generate functions
def generateWeekday(db, latitude, longitude, time):
    """Return ``time.weekday()`` as the feature value.

    ``db``, ``latitude`` and ``longitude`` are not used; they are part of the
    ``(db, latitude, longitude, time)`` signature shared by the generate*
    functions in this file.
    """
    dayOfWeek = time.weekday()
    return dayOfWeek
def generateHour(db, latitude, longitude, time):
    """Return the ``hour`` attribute of ``time`` as the feature value.

    ``db``, ``latitude`` and ``longitude`` are not used; they are part of the
    ``(db, latitude, longitude, time)`` signature shared by the generate*
    functions in this file.
    """
    hourOfDay = time.hour
    return hourOfDay
def generateWeather(db, latitude, longitude, time):
    """Return the ``tornado`` attribute of ``db.getWeather(time)``.

    ``latitude`` and ``longitude`` are not used; only ``time`` is passed to
    the weather lookup.
    """
    return db.getWeather(time).tornado
def generateNumPickupsBefore(db, latitude, longitude, time):
    """Return ``db.getNumPickupsNearLocation`` for a window before ``time``.

    The two time bounds passed to the lookup are ``time - 2 hours`` and
    ``time - 1 hour``, in that order.
    """
    windowStart = time - datetime.timedelta(hours=2)
    windowEnd = time - datetime.timedelta(hours=1)
    return db.getNumPickupsNearLocation(latitude, longitude, windowStart, windowEnd)
def generateNumPickupsAfter(db, latitude, longitude, time):
    """Return ``db.getNumPickupsNearLocation`` for a window after ``time``.

    The two time bounds passed to the lookup are ``time + 3 hours`` and
    ``time + 4 hours``, in that order.
    """
    windowStart = time + datetime.timedelta(hours=3)
    windowEnd = time + datetime.timedelta(hours=4)
    return db.getNumPickupsNearLocation(latitude, longitude, windowStart, windowEnd)
def generateAfterEvent(db, latitude, longitude, time):
    """Return ``db.afterNumEvents(latitude, longitude, time, 3)``.

    The meaning of the trailing constant ``3`` is defined by
    ``DB.afterNumEvents`` and is not visible from this file.
    """
    eventCount = db.afterNumEvents(latitude, longitude, time, 3)
    return eventCount
def generateDuringEvent(db, latitude, longitude, time):
    """Return ``db.isDuringEvent(latitude, longitude, time)`` unchanged."""
    duringEvent = db.isDuringEvent(latitude, longitude, time)
    return duringEvent
# Include functions
def includeWeekday(weekday):
    """Build a predicate that is true when ``time.weekday() == weekday``.

    The returned callable takes ``(db, latitude, longitude, time)`` and only
    looks at ``time``.
    """
    def isOnWeekday(db, latitude, longitude, time):
        return time.weekday() == weekday

    return isOnWeekday
def includeHour(hour):
    """Build a predicate that is true when ``time.hour == hour``.

    The returned callable takes ``(db, latitude, longitude, time)`` and only
    looks at ``time``.
    """
    def isAtHour(db, latitude, longitude, time):
        return time.hour == hour

    return isAtHour
# Feature Selection
def featureSelection():
    """Plot univariate F-test feature scores for every point of interest.

    For each entry returned by ``getPointsOfInterest()`` the data set is loaded
    with ``generateAllFeaturesExceptWeather``, a ``SelectPercentile`` selector
    using ``f_regression`` is fitted, and ``-log10(p-value)`` of every feature
    is drawn as a bar chart in that entry's subplot. All subplots share one
    figure, which is shown once at the end.
    """
    with DB() as db:
        POIs = getPointsOfInterest()
        numPOIs = len(POIs)

        # Size the grid so that numRows * numCols >= numPOIs for every count.
        # The previous int(sqrt(n)) x (int(sqrt(n)) + 1) layout was too small
        # for e.g. 3 or 8 POIs, which made plt.subplot() raise.
        numCols = max(1, int(math.ceil(math.sqrt(numPOIs))))
        numRows = max(1, int(math.ceil(numPOIs / float(numCols))))

        plt.figure()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)

        width = 0.6  # bar width; loop-invariant

        # Subplot indices are 1-based.
        for fignum, POI in enumerate(POIs, 1):
            print(POI)
            x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeaturesExceptWeather)
            x, y = np.array(x), np.array(y)

            # One bar per feature; the size of the last axis of x is used as
            # the feature count.
            x_indices = np.arange(x.shape[-1])

            # Univariate feature selection with F-test for feature scoring.
            # Only the p-values are used below; the percentile setting does
            # not affect them.
            selector = SelectPercentile(f_regression, percentile=10)
            selector.fit(x, y)
            scores = -np.log10(selector.pvalues_)

            plt.subplot(numRows, numCols, fignum)
            # Centre each bar on its tick explicitly. Shifting by width / 2
            # only centres bars when matplotlib's default alignment is 'edge';
            # with a 'center' default it pushes them off the ticks.
            plt.bar(x_indices, scores, width=width, align='center', color='g')
            plt.title(POI['NAME'])
            plt.xlabel('Feature number')
            plt.ylabel('Univariate score ($-Log(p_{value})$)')
            plt.xticks(x_indices)
            plt.axis('tight')
            # No plt.legend() here: nothing drawn carries a label, so the call
            # produced no legend and only emitted a warning.

        plt.show()
def recursiveFeatureElimination():
    """Run cross-validated recursive feature elimination for every POI.

    For each entry returned by ``getPointsOfInterest()`` the data set is loaded
    with ``generateAllFeatures``, an ``RFECV`` wrapper around a linear ``SVR``
    is fitted, the selected number of features is printed, and the
    cross-validation score per feature count is plotted in that entry's
    subplot. All subplots share one figure, which is shown once at the end.
    """
    with DB() as db:
        POIs = getPointsOfInterest()
        numPOIs = len(POIs)

        # Size the grid so that numRows * numCols >= numPOIs for every count.
        # The previous int(sqrt(n)) x (int(sqrt(n)) + 1) layout was too small
        # for e.g. 3 or 8 POIs, which made plt.subplot() raise.
        numCols = max(1, int(math.ceil(math.sqrt(numPOIs))))
        numRows = max(1, int(math.ceil(numPOIs / float(numCols))))

        plt.figure()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)

        # Subplot indices are 1-based.
        for fignum, POI in enumerate(POIs, 1):
            x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeatures)
            x, y = np.array(x), np.array(y)

            # Create the RFE object and compute a cross-validated score.
            # The estimator is a regressor, so it is scored with R^2 and split
            # with plain 2-fold CV (an integer cv selects K-fold for
            # regressors). The classification metric 'accuracy' and
            # StratifiedKFold do not apply to a continuous target.
            svr = SVR(kernel="linear")
            rfecv = RFECV(estimator=svr, step=1, cv=2, scoring='r2')
            rfecv.fit(x, y)

            print("Optimal number of features : %d" % rfecv.n_features_)

            # Plot number of features VS. cross-validation scores
            plt.subplot(numRows, numCols, fignum)
            plt.title(POI['NAME'])
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (R^2)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

        plt.show()
def _main():
    """Script entry point: run the univariate feature-selection plots."""
    # Alternative entry points that can be swapped in here:
    #   plot(generateWeekday, xLabel='Day of week')
    #   plot(generateAfterEvent, xLabel='Num of ongoing events')
    #   with DB() as db:
    #       for POI in getPointsOfInterest():
    #           print len(db.getXClosestStations(POI['LAT'], POI['LONG'], 3, True))
    featureSelection()


if __name__ == '__main__':
    _main()