-
Notifications
You must be signed in to change notification settings - Fork 0
/
filteringdata.py
173 lines (151 loc) · 4.89 KB
/
filteringdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# coding=utf-8
import math
def manhattan(rating1, rating2):
"""Computes the Manhattan distance. Both rating1 and rating2 are dictionaries
of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""
distance = 0
total = 0
for key in rating1:
if key in rating2:
distance += abs(rating1[key] - rating2[key])
total += distance
if total > 0:
return distance / total
else:
return -1 #Indicates no ratings in common
def euclidean(rating1, rating2):
"""Computes the Euclidean distance, which is the straight
line distance between two points on a plane. Both rating1 and rating2 are dictionaries
of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""
distance = 0
commonRatings = False
for key in rating1:
if key in rating2:
distance += (rating1[key]-rating2[key])**2
commonRatings = True
if commonRatings:
return math.sqrt(distance)
else:
return -1 #Indicates no ratings in common
def minkowski(rating1, rating2, level=2):
"""Computes the Minkowski distance, which is a generalized form
for the Manhattan (level 1) and Euclidean (level 2) distances.
Both rating1 and rating2 are dictionaries
of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""
distances = []
for key in rating1:
if key in rating2:
distance = math.pow(abs(rating1[key] - rating2[key]), level)
distances.append(distance)
if len(distances) > 0:
return math.pow(sum(distances), 1/level)
else:
return -1 #Indicates no ratings in common
def computeNearestNeighbor(username, users):
"""creates a sorted list of users based on their distance to username"""
distances = []
for user in users:
if user != username:
distance = minkowski(users[user], users[username], 1)
distances.append((distance, user))
# sort based on distance -- closest first
distances.sort()
return distances
def recommend(username, users):
"""Give list of recommendations"""
# first find nearest neighbor
nearest = computeNearestNeighbor(username, users)[0][1]
recommendations = []
# now find bands neighbor rated that user didn't
neighborRatings = users[nearest]
userRatings = users[username]
for artist in neighborRatings:
if not artist in userRatings:
recommendations.append((artist, neighborRatings[artist]))
recommendations.sort(key=lambda artistTuple: artistTuple[1], reverse = True)
return recommendations
def pearson(rating1, rating2):
"""Determines the Pearson coefficient for two ratings"""
n = sum([1.0 for key in rating1 if key in rating2])
x_sum = sum([rating1[x] for x in rating1 if x in rating2])
y_sum = sum([rating2[y] for y in rating2 if y in rating1])
x_squares_sum = sum([math.pow(rating1[x],2.0) for x in rating1 if x in rating2])
y_squares_sum = sum([math.pow(rating2[y],2.0) for y in rating2 if y in rating1])
a = sum([(rating1[key] * rating2[key]) for key in rating1 if key in rating2])
b = (x_sum * y_sum) / n
c = math.sqrt(x_squares_sum - (math.pow(x_sum, 2.0) / n))
d = math.sqrt(y_squares_sum - (math.pow(y_sum, 2.0) / n))
if c*d == 0:
total = 0
else:
total = (a - b) / (c * d)
return total
def test():
users = {
"Angelica": {
"Blues Traveler": 3.5,
"Broken Bells": 2.0,
"Norah Jones": 4.5,
"Phoenix": 5.0,
"Slightly Stoopid": 1.5,
"The Strokes": 2.5,
"Vampire Weekend": 2.0
}, "Bill": {
"Blues Traveler": 2.0,
"Broken Bells": 3.5,
"Deadmau5": 4.0,
"Phoenix": 2.0,
"Slightly Stoopid": 3.5,
"Vampire Weekend": 3.0
}, "Chan": {
"Blues Traveler": 5.0,
"Broken Bells": 1.0,
"Deadmau5": 1.0,
"Norah Jones": 3.0,
"Phoenix": 5,
"Slightly Stoopid": 1.0
}, "Dan": {
"Blues Traveler": 3.0,
"Broken Bells": 4.0,
"Deadmau5": 4.5,
"Phoenix": 3.0,
"Slightly Stoopid": 4.5,
"The Strokes": 4.0,
"Vampire Weekend": 2.0
}, "Hailey": {
"Broken Bells": 4.0,
"Deadmau5": 1.0,
"Norah Jones": 4.0,
"The Strokes": 4.0,
"Vampire Weekend": 1.0
}, "Jordyn": {
"Broken Bells": 4.5,
"Deadmau5": 4.0,
"Norah Jones": 5.0,
"Phoenix": 5.0,
"Slightly Stoopid": 4.5,
"The Strokes": 4.0,
"Vampire Weekend": 4.0
}, "Sam": {
"Blues Traveler": 5.0,
"Broken Bells": 2.0,
"Norah Jones": 3.0,
"Phoenix": 5.0,
"Slightly Stoopid": 4.0,
"The Strokes": 5.0
}, "Veronica": {
"Blues Traveler": 3.0,
"Norah Jones": 5.0,
"Phoenix": 4.0,
"Slightly Stoopid": 2.5,
"The Strokes": 3.0
}}
print 'Testing Manhattan'
assert(manhattan(users['Hailey'], users['Veronica']) == 2.0)
print 'Testing Pearson'
assert(pearson(users['Angelica'], users['Bill']) == -0.90405349906826993)
assert(pearson(users['Angelica'], users['Hailey']) == 0.42008402520840293)
assert(pearson(users['Angelica'], users['Jordyn']) == 0.76397486054754316)
print 'Testing Pearson: PASS'
if __name__ == '__main__':
test()