-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
172 lines (142 loc) · 6.31 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import pandas as pd
import streamlit as st
import requests
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
ps = PorterStemmer()
# ___FUNCTIONS___
# stemmer function
def stem(text):
    """Return *text* with every whitespace-separated word stemmed.

    Each word is passed through the module-level Porter stemmer ``ps`` and
    the stemmed words are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())
# To find the common values in 2 lists
def intersection(lst1, lst2):
    """Return the items of *lst1* that also occur in *lst2*.

    The order (and any repetition) of *lst1* is preserved. Membership is
    checked against a set built once from *lst2*, so the common case of
    hashable items (e.g. movie titles) runs in linear time instead of
    scanning *lst2* for every item. If either list holds unhashable items
    the function falls back to the plain list scan.
    """
    items = list(lst1)
    candidates = list(lst2)
    try:
        lookup = set(candidates)
        return [value for value in items if value in lookup]
    except TypeError:
        # Unhashable element somewhere: use equality-based list membership.
        return [value for value in items if value in candidates]
# function to find the tmdbId of each movie title (used to fetch posters)
def get_tmdb_id(lst, movies_list, links_list):
    """Look up the tmdbId of every title in *lst*.

    Args:
        lst: iterable of movie titles to look up.
        movies_list: DataFrame with at least ``movieId`` and ``title``.
        links_list: DataFrame with at least ``movieId`` and ``tmdbId``.

    Returns:
        A list of tmdbId values in the order of *lst*. A title that matches
        several rows contributes one id per matching row; a title that
        matches no row contributes nothing. Missing tmdbId values are
        returned as 0.
    """
    merged = movies_list.merge(links_list, on="movieId", how="outer")
    # fillna returns a new object; the result has to be assigned to take effect.
    merged["tmdbId"] = merged["tmdbId"].fillna(0)
    # Rows that only exist in links_list have no title and can never match.
    merged = merged.dropna(subset=["title"])

    # Build the title -> ids table once instead of scanning every row
    # of the frame for every requested title.
    ids_by_title = {}
    for title, tmdb_id in zip(merged["title"], merged["tmdbId"]):
        ids_by_title.setdefault(title, []).append(tmdb_id)

    result1 = []
    for title in lst:
        result1.extend(ids_by_title.get(title, ()))
    return result1
# function to fetch posters from tmdb
# tmdb api :- https://www.themoviedb.org/documentation/api
def fetch_poster(movie_id):
    """Return the URL of the w500 poster image for a TMDB movie id.

    Args:
        movie_id: TMDB movie id. Integral floats (e.g. ``862.0``, as produced
            when the id column is read through pandas) are formatted as
            plain integers.

    Returns:
        The full ``https://image.tmdb.org/...`` poster URL as a string.

    Raises:
        LookupError: if the API response carries no poster path (unknown
            movie id, or a movie without a poster).
        requests.RequestException: on network failure or timeout.
    """
    # NOTE(review): the API key is hard-coded in source; consider moving it
    # to configuration/secrets before publishing this file.
    if isinstance(movie_id, float) and movie_id.is_integer():
        movie_id = int(movie_id)
    response = requests.get(
        "https://api.themoviedb.org/3/movie/{}".format(movie_id),
        params={"api_key": "8265bd1679663a7ea12ac168da84d2e8", "language": "en-US"},
        timeout=10,  # never let a stalled request hang the page forever
    )
    data = response.json()
    poster_path = data.get('poster_path')
    if not poster_path:
        raise LookupError("no poster available for TMDB movie id {!r}".format(movie_id))
    # Strip the path's leading slash so the joined URL has no doubled slash.
    return "https://image.tmdb.org/t/p/w500/" + poster_path.lstrip("/")
# collaborative filtering basics : https://developers.google.com/machine-learning/recommendation/collaborative/basics
def collaborative_rec(movie, movie_titles, rating_input, min_ratings=100):
    """Recommend movies whose user ratings correlate with those of *movie*.

    Args:
        movie: title of the movie to find recommendations for.
        movie_titles: DataFrame with at least ``movieId`` and ``title``.
        rating_input: DataFrame with ``userId``, ``movieId`` and ``rating``.
        min_ratings: only movies with strictly more than this many ratings
            are considered, so that sparsely rated movies are not
            recommended on the strength of a few coincidental ratings.

    Returns:
        A list of titles ordered from the most to the least correlated.
        The selected movie itself is never part of the list. An empty list
        is returned when *movie* has no ratings at all.
    """
    rated = pd.merge(rating_input, movie_titles, on='movieId')
    moviemat = rated.pivot_table(index='userId', columns='title', values='rating')
    if movie not in moviemat.columns:
        # Movie exists in the catalogue but nobody rated it.
        return []

    num_ratings = rated.groupby('title')['rating'].count().rename('num of ratings')

    # creating correlation series/list
    correlation = moviemat.corrwith(moviemat[movie]).dropna()
    corr_df = correlation.to_frame(name='Correlation').join(num_ratings)

    popular = corr_df[corr_df['num of ratings'] > min_ratings]
    ranked = popular.sort_values('Correlation', ascending=False).index
    # The movie always correlates perfectly with itself; do not recommend it.
    return [title for title in ranked if title != movie]
# content based filtering basics : https://developers.google.com/machine-learning/recommendation/content-based/basics
def content_based_rec(movie, movies, tags, top_n=100):
    """Recommend movies whose tags and genres are most similar to *movie*.

    Every movie gets one text document made of all its user tags plus its
    genres (lower-cased and stemmed). The documents are vectorised with a
    bag-of-words model and ranked by cosine similarity to the document of
    the selected movie.

    Args:
        movie: title of the movie to find recommendations for.
        movies: DataFrame with ``movieId``, ``title`` and ``genres``.
        tags: DataFrame with at least ``movieId`` and ``tag``.
        top_n: maximum number of recommendations to return.

    Returns:
        A list of up to *top_n* titles, most similar first, never containing
        the selected movie's own row. An empty list is returned when *movie*
        is not in *movies*.
    """
    # Left merge: tags for ids absent from `movies` would only create
    # title-less rows that must never be recommended.
    merged = movies.merge(tags, on="movieId", how="left")
    merged['tag'] = merged['tag'].fillna('') + " " + merged['genres'].fillna('')
    # '|' separates genres; it must be replaced literally. As a regex it is
    # an alternation of empty patterns and would not match the pipe itself.
    merged['tag'] = merged['tag'].str.replace('|', ' ', regex=False)
    # Explicit copy so the assignments below do not write into a slice.
    merged = merged[['movieId', 'tag', 'title']].copy()

    # converting tags into proper format for vectorisation:
    # one combined document per movie, then one row per movie
    merged['tag'] = merged.groupby('movieId')['tag'].transform(
        lambda group: ' '.join(str(value) for value in group))
    catalog = merged.drop_duplicates().reset_index(drop=True)
    catalog['tag'] = catalog['tag'].str.lower().apply(stem)

    matches = catalog[catalog['title'] == movie].index
    if len(matches) == 0:
        return []
    movie_index = int(matches[0])

    # applying vectorisation and cosine similarity from sklearn lib
    # to learn more : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
    cv = CountVectorizer(max_features=5000, stop_words='english')
    vectors = cv.fit_transform(catalog['tag'])
    # Only the selected movie's row of the similarity matrix is needed, so
    # compare that single (sparse) row against all rows instead of building
    # the dense N x N matrix.
    distances = cosine_similarity(vectors[movie_index:movie_index + 1], vectors)[0]

    # order is lost during sorting, so we enumerate it to keep the index
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Exclude the movie by index: a movie with identical tags may sort ahead
    # of it, so dropping "the first entry" is not reliable.
    similar = [index for index, _ in ranked if index != movie_index][:top_n]
    return [catalog.iloc[index].title for index in similar]
# _______________________________________________________________________________________________________________
# input
movies = pd.read_csv('movies.csv')
links = pd.read_csv('links.csv')
rating_csv = pd.read_csv('ratings.csv')
tags_csv = pd.read_csv('tags.csv')

# UI using streamlit
# streamlit documentation : https://docs.streamlit.io/library/api-reference
st.set_page_config(layout="wide")
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
st.title('Microsoft Engage : Movie recommendation system ')
select_movie = st.selectbox('Movie Name here!', movies['title'].values, key='mix')

if st.button('Recommend'):
    # Streamlit re-runs the whole script on every interaction, so the
    # recommenders are only run once the user actually asks for a result.
    recommendation_collab = collaborative_rec(select_movie, movies, rating_csv)
    recommendation_content = content_based_rec(select_movie, movies, tags_csv)

    # Movies suggested by both recommenders come first; the remaining slots
    # (up to five) are filled with content-based picks not already chosen.
    result = intersection(recommendation_collab, recommendation_content)
    for title in recommendation_content:
        if len(result) >= 5:
            break
        if title not in result:
            result.append(title)
    result = result[:5]

    # One poster per displayed title, looked up title by title so that
    # titles and posters cannot get out of step. A poster that cannot be
    # fetched is skipped rather than failing the whole page.
    recommended_movie_posters = []
    for title in result:
        poster = None
        tmdb_id = get_tmdb_id([title], movies, links)
        if tmdb_id:
            try:
                poster = fetch_poster(tmdb_id[0])
            except (LookupError, TypeError, ValueError, requests.RequestException):
                poster = None
        recommended_movie_posters.append(poster)

    # OUTPUT
    st.markdown("<h1 style='text-align: center; color: #a0fbff;'>Our Recommendations</h1>", unsafe_allow_html=True)
    for column, title, poster in zip(st.columns(5), result, recommended_movie_posters):
        with column:
            st.text(title)
            if poster is not None:
                st.image(poster)

    col1, col2 = st.columns(2)
    with col1:
        st.write('______________________________________________')
        st.subheader('Movies loved by similar users')
        for title in recommendation_collab[:5]:
            st.write(title)
        if len(recommendation_collab) < 5:
            st.write('---- lack of user ratings ----')
    with col2:
        st.write('______________________________________________')
        st.subheader('Movies with similar content')
        for title in recommendation_content[:5]:
            st.write(title)