-
Notifications
You must be signed in to change notification settings - Fork 0
/
contentBasedRecommender.py
112 lines (79 loc) · 3.86 KB
/
contentBasedRecommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import fileLoad as fl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the raw review records and split them into parallel columns:
# index i of text/user/prod/scor all describe the same review.
reviews = fl.read_json('material/instruments.json')
text, user, prod, scor = [], [], [], []
# (destination column, key in the review record), in the order they are read
column_fields = (
    (text, 'reviewText'),
    (user, 'reviewerID'),
    (prod, 'asin'),
    (scor, 'overall'),
)
for record in reviews:
    for column, field in column_fields:
        column.append(record[field])
# Combine the review texts for each product and for each user
def _join_reviews_by_key(keys, review_texts):
    """Concatenate all review texts that share the same key.

    Args:
        keys: One grouping key per review (e.g. product id or reviewer id).
        review_texts: The review texts, aligned position-by-position with
            ``keys``.

    Returns:
        A dict mapping each key to a single string in which every review of
        that key is followed by " . " and consecutive reviews are separated
        by one extra space. Keys appear in first-seen order.
    """
    grouped = {}
    for key, review_text in zip(keys, review_texts):
        grouped.setdefault(key, []).append(review_text + " . ")
    # Join once per key instead of growing a string with += on every review.
    return {key: " ".join(chunks) for key, chunks in grouped.items()}


product_reviews_dict = _join_reviews_by_key(prod, text)
user_reviews_dict = _join_reviews_by_key(user, text)
# The user we recommend for, and the ids of the products he has already reviewed
kurt_robair_id = "AMNTZU1YQN1TH"
kurt_robairs_reviews = [
    "B00004Y2UT", "B00006LVEU", "B0002E1H9W", "B0002M728Y", "B0002M72JS",
    "B0006NDF8A", "B000EEHKVY", "B0018TIADQ", "B001FQ74FW", "B003JJQMD8",
]

# Combined review texts, in the same order as the keys of the two dicts
product_reviews = list(product_reviews_dict.values())
user_reviews = list(user_reviews_dict.values())

# All of Kurt Robair's review texts, concatenated into one document
kurt_robairs_collected_reviews = user_reviews_dict[kurt_robair_id]

# TF-IDF matrix with one row per document:
#   row 0     -> Kurt's combined reviews
#   row 1..n  -> each product's combined reviews (product_reviews order)
#
#   id    | term_1 | term_2 | ...
#   kurt  |  1.5   |  2.3   | ...   <- tf_idf values
#   prod1 |  0.7   |  1.3   | ...
#   prod2 |  0.3   |  2.1   | ...
tfidf_vectorizer = TfidfVectorizer()
corpus = [kurt_robairs_collected_reviews, *product_reviews]
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Cosine similarity of Kurt's row against every row; the first entry is
# Kurt compared with himself, so it is dropped to leave one score per product.
similarity_to_kurt = cosine_similarity(tfidf_matrix[0], tfidf_matrix)[0]
product_scores = similarity_to_kurt[1:]

# Pair every product id with its score, best match first
ordered_products_with_scores = list(zip(product_reviews_dict, product_scores))
ordered_products_with_scores.sort(key=lambda pair: pair[1], reverse=True)

# Keep only the products Kurt Robair has not reviewed yet
recommended_products = [
    scored
    for scored in ordered_products_with_scores
    if scored[0] not in kurt_robairs_reviews
]

# Print the ten best-scoring recommendations for Kurt Robair
top_recommendations = recommended_products[:10]
for kvp in top_recommendations:
    print(f"Id: {kvp[0]}, Score: {kvp[1]}")
# Kurt's Recommendations: 7 * 5.0, 3 * 3.0
# He only reviews items he likes, so we can use his reviews as a basis for finding other items!
# We should have filtered for his 4+ reviews and used only those.
def pseudo_code():
    """Sketch of a weighted-distance, centroid-based movie recommender.

    This is pseudo-code, not runnable code: nothing in this file calls it,
    and it uses names that are not defined or imported in the visible source
    (``_``, ``np``, ``movies``, ``cluster_movies_for``, ``sorted_on_snd``).
    """
    # MOVIES (id, title, length in minutes, genres, budget $, date)
    lotr = (0, "Lord of the Rings: The King Returns", 200, ["fantasy", "drama"], 94000000, "17/12-2003")
    nights = (1, "All these Sleeping Nights", 100, ["documentary", "drama"], 100000, "04/11-2016")
    # `_` stands in for an id that has not been assigned (it is not a defined name).
    kasper1 = (_, "Batman Rings Killing Softly", 92, ["crime", "action", "romance"], 40000000, "06/05-1994")
    kasper2 = (_, "Money Wolf Never Sleeps", 101, ["mocumentary", "documentary"], 1000000, "17/01-2019")

    # One distance measure per movie field. These are plain strings here,
    # presumably naming the metric intended for each field; dist() below
    # calls them as if they were functions.
    dist_title = "Cosine similarity"
    dist_length = "mse"
    dist_genres = "Jaccard"
    dist_budget = "mse"
    dist_date = "mse"
    # One weight per entry of `distances` in dist(), in the same order.
    # NOTE(review): the weights sum to 1.1, not 1.0 - confirm this is intended.
    dist_weight = [0.3, 0.2, 0.1, 0.2, 0.3]

    def dist(a, b):
        """Return the weighted combination of the per-field distances between a and b."""
        # distances between a and b of course.
        distances = [dist_title(a, b), dist_length(a, b), dist_genres(a, b), dist_budget(a, b), dist_date(a, b)]
        weighted_dist = np.dot(distances, dist_weight)
        return weighted_dist

    def get_recommendations(u, k):
        """Return the first k (movie index, distance) pairs for user u.

        Each movie's distance is the smallest dist() to any of the centroids
        returned by cluster_movies_for(u). The ordering comes from
        sorted_on_snd, which is not defined here - presumably it sorts the
        pairs by their second element (the distance); TODO confirm.
        """
        centroids = cluster_movies_for(u)
        movie_count = len(movies)
        # Start every movie at infinity so that any real distance replaces it.
        predictions = np.ones(movie_count) * float('inf')
        for c in centroids:
            for i in range(movie_count):
                d = dist(c, movies[i])
                # Keep the distance to the closest centroid seen so far.
                if d < predictions[i]:
                    predictions[i] = d
        return sorted_on_snd(zip(range(movie_count), predictions))[:k]