#!/usr/bin/env python
# coding: utf-8
# Import required packages
# In[1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle
import sys
from pathlib import Path
# Validate the dataset format
# In[2]:
# Receive the user name from app.py
user_input = sys.argv[1]
# Get the uploaded user-item dataset file name from pickle
with open('./pickle/ui_dataset.pickle', 'rb') as f:
    ui_dataset = pickle.load(f)
BASE_DIR = Path(__file__).resolve(strict=True).parent
ui_dataset_file_name = './uploads/' + ui_dataset
# The sheet is expected to contain at least 'username' and 'course' columns
my_user_item = pd.read_excel(ui_dataset_file_name)
# Count the number of courses
# In[3]:
course_counts = my_user_item['course'].value_counts()
# Build an alphabetically sorted Series of unique course names,
# re-indexed 0..n-1 for positional lookups
courses = pd.Series(course_counts.index)
courses = courses.sort_values().set_axis(range(len(courses)))
number_of_courses = len(course_counts)
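# Illustrative (hypothetical query string): process.extractOne fuzzy-matches a
# query against the courses Series and returns (match, score, index); the
# recommender functions below rely on that index.
# match, score, idx = process.extractOne('Intro to Pythn', courses)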
# Load matrices
# In[4]:
folder_path = './pickle'
with open(f'{folder_path}/knn_matrix.pickle', 'rb') as f:
    knn_matrix = pickle.load(f)
with open(f'{folder_path}/tfidf_matrix.pickle', 'rb') as f:
    tfidf_matrix = pickle.load(f)
# Combine knn_matrix and tfidf_matrix together
# In[5]:
from sklearn.preprocessing import normalize
from scipy.sparse import hstack
# Set the weights
tfidf_weight = 0.3
knn_weight = 0.7
# Normalize the matrices
tfidf_matrix_normalized = normalize(tfidf_matrix) * tfidf_weight
knn_matrix_normalized = normalize(knn_matrix) * knn_weight
# Combine the normalized matrices; convert to CSR so single rows can be sliced
# out later (hstack may return a COO matrix, which does not support row indexing)
combined_matrix = hstack((tfidf_matrix_normalized, knn_matrix_normalized)).tocsr()
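# Minimal sanity check (illustrative, not part of the original flow): hstack
# concatenates column-wise, so the combined matrix keeps one row per course
# while its width is the sum of the two feature widths.
assert combined_matrix.shape[0] == tfidf_matrix.shape[0]
assert combined_matrix.shape[1] == tfidf_matrix.shape[1] + knn_matrix.shape[1]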
# Nearest neighbours by cosine similarity
# In[6]:
model_combined = NearestNeighbors(metric='cosine', algorithm='brute').fit(combined_matrix)
# hstack produced a matrix where each row is the concatenation of the
# corresponding rows from tfidf_matrix and knn_matrix; the NearestNeighbors
# model is fit on this combined matrix.
# Export model_combined to a pickle file
with open('./pickle/model_combined.pickle', 'wb') as f:
    pickle.dump(model_combined, f)
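# Illustrative query, kept commented like the other examples in this file:
# asking the fitted model for the neighbours of one course row returns cosine
# distances and the row indices of the closest courses.
# distances, indices = model_combined.kneighbors(combined_matrix[0], n_neighbors=5)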
# Recommender function using hybrid model
# In[7]:
# def recommender_hybrid(course_name):
# n_recommendations = int( number_of_courses ** (1/2) )
# idx = process.extractOne(course_name, courses)[2]
# distances, indices = model_combined.kneighbors(combined_matrix[idx], n_neighbors=n_recommendations+1, return_distance=True)
# recommendations = [courses[i].where(i!=idx) for i in indices]
# recommended_courses = recommendations[0][1:]
# course_distances = distances[0][1:]
# d = {
# 'Course': recommended_courses,
# 'Cosine Distance': course_distances
# }
# results = pd.DataFrame(data=d)
# n_distance = results['Cosine Distance'].where(results['Cosine Distance'] < 0.8).count()
# return results.head(n_distance)
# KNN Recommender function for all courses
# In[8]:
# def recommender_knn_all_courses(course_name):
# idx = process.extractOne(course_name, courses)[2]
# # print('Selected movie:', courses[idx], 'Index:', idx)
# distances, indices = model_combined.kneighbors(knn_matrix[idx], n_neighbors=len(courses))
# recommendations = [courses[i].where(i!=idx) for i in indices]
# recommended_courses = recommendations[0][1:]
# scores = 1 - distances
# course_distances = scores[0][1:]
# d = {
# 'Course': recommended_courses,
# 'Score': course_distances
# }
# results = pd.DataFrame(data=d)
# results = results.sort_index().rename_axis('Index')
# return results
# KNN Recommender function using username
# In[9]:
# def recommender_knn_by_user(user_name, n_recommendations):
# df = {
# 'User': pd.Series(my_user_item['username']),
# 'Course': pd.Series(my_user_item['course'])
# }
# user_course = pd.DataFrame(df)
# selected_user_name = user_course.loc[user_course['User'] == user_name]
# selected_courses = selected_user_name['Course']
# recommended_courses = [ recommender_knn_all_courses(x) for x in selected_courses]
# # pre dataframe
# df = pd.DataFrame({
# 'Course': [],
# 'Score': []
# }).rename_axis('Index')
# for x in recommended_courses:
# df = df._append(x)
# df = df.sort_values('Score', ascending=False).drop_duplicates('Course')
# return df.head(n_recommendations)
# Hybrid Recommender function for all courses
# In[10]:
def recommender_hybrid_all_courses(course_name):
    # Fuzzy-match the query against the course list; extractOne on a pandas
    # Series returns (match, score, index), so [2] is the positional index
    idx = process.extractOne(course_name, courses)[2]
    # print('Selected course:', courses[idx], 'Index:', idx)
    distances, indices = model_combined.kneighbors(combined_matrix[idx], n_neighbors=len(courses))
    # Mask out the queried course itself, then drop it from the results
    recommendations = [courses[i].where(i != idx) for i in indices]
    recommended_courses = recommendations[0][1:]
    # Convert cosine distances to similarity scores
    scores = 1 - distances
    course_distances = scores[0][1:]
    d = {
        'Course': recommended_courses,
        'Score': course_distances
    }
    results = pd.DataFrame(data=d)
    results = results.sort_index().rename_axis('Index')
    return results
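# Usage sketch (hypothetical course title), kept commented like the other
# examples in this file:
# all_scores = recommender_hybrid_all_courses('Introduction to Python')
# print(all_scores.sort_values('Score', ascending=False).head())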
# Hybrid Recommender function using username
# In[11]:
def recommender_hybrid_by_user(user_name):
    n_recommendations = 10
    df = {
        'User': pd.Series(my_user_item['username']),
        'Course': pd.Series(my_user_item['course'])
    }
    user_course = pd.DataFrame(df)
    selected_user_name = user_course.loc[user_course['User'] == user_name]
    selected_courses = selected_user_name['Course']
    recommended_courses = [recommender_hybrid_all_courses(x) for x in selected_courses]
    # Remove the courses the user has already taken
    recommended_courses = [x[~x['Course'].isin(selected_courses)] for x in recommended_courses]
    # Collect the per-course recommendation lists into one frame
    df = pd.DataFrame({
        'Course': [],
        'Score': []
    }).rename_axis('Index')
    for x in recommended_courses:
        df = pd.concat([df, x])
    df = df.sort_values('Score', ascending=False).drop_duplicates('Course')
    return df.head(n_recommendations)
# Item-based filtering has practical advantages: items are stable over time,
# whereas people's tastes change.
# There are also fewer items than users, which keeps the similarity matrix
# small. Amazon and Netflix use this approach.
# It is better for new users:
# - selecting just one item is enough to produce recommendations
# - with user-based filtering, a new user has to wait for the next build of
#   the similarity matrix (the only computationally heavy part of the framework)
# In[13]:
print(recommender_hybrid_by_user(user_input).to_html(index=False))
# Training and Testing part
# In[14]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# k nearest neighbors
# Predata of training
# In[15]:
with open(f'{folder_path}/knn_X.pickle', 'rb') as f:
    knn_X = pickle.load(f)
with open(f'{folder_path}/knn_y.pickle', 'rb') as f:
    knn_y = pickle.load(f)
# Split the train and the test
# In[16]:
# Use the train test split function
X_train, X_test, y_train, y_test = train_test_split(
    knn_X, knn_y, random_state=42, test_size=0.25
)
# In[17]:
regressor = KNeighborsRegressor(n_neighbors=3)
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)
# In[18]:
# print("Mean Squared Error:", mean_squared_error(predictions,y_test) / 4)
# print("Mean Absolute Error:", mean_absolute_error(predictions,y_test) / 4)
# Term Frequency and Inverse Document Frequency
# Predata of training
# In[19]:
with open(f'{folder_path}/tfidf_X.pickle', 'rb') as f:
    tfidf_X = pickle.load(f)
with open(f'{folder_path}/tfidf_y.pickle', 'rb') as f:
    tfidf_y = pickle.load(f)
# Split the train and the test
# In[20]:
# Split the train test data
X_train, X_test, y_train, y_test = train_test_split(tfidf_X, tfidf_y, test_size=0.2, random_state=42)
# Fit model selection
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
predictions = naive_bayes.predict(X_test)
# Binarize the labels: 1 for 'recommended', 0 otherwise
predictions = [ 1 if x == 'recommended' else 0 for x in predictions]
y_test = [ 1 if x == 'recommended' else 0 for x in y_test]
# Measure the performance for our parameters
# In[21]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
# print(f'Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1: {f1}')
# %%