diff --git a/src/data/cleaning_data.py b/src/data/cleaning_data.py
index 0a44927..91eb2b2 100644
--- a/src/data/cleaning_data.py
+++ b/src/data/cleaning_data.py
@@ -1,4 +1,5 @@
 from src.utils.initialize import *
+import pprint
 
 # make sure there are same number of distinct genres in movies_with_overviews
@@ -7,15 +8,21 @@
 # cleaning
 # load no_duplicate_movies
-with open('data/interim/no_duplicate_movies.pkl','rb') as f:
-    no_duplicate_movies=pickle.load(f)
+# # print("Loading the list of de-duped movies from data/interim/no_duplicate_movies.pkl...")
+# with open('data/interim/no_duplicate_movies.pkl','rb') as f:
+#     no_duplicate_movies=pickle.load(f)
+# print("Loaded the list of de-duped movies from data/interim/no_duplicate_movies.pkl.")
+
+# print("Loading the list of movies that have overviews from data/interim/movies_with_overviews.pkl...")
 with open('data/interim/movies_with_overviews.pkl','rb') as f:
     movies_with_overviews=pickle.load(f)
+print("Loaded the list of movies that have overviews from data/interim/movies_with_overviews.pkl.\n")
 
 # Y
 # list of genres and movie ids in prep for binarization
+print("Extracting the genres and movie ids in prep for binarization...")
 genres=[]
 all_ids=[]
 for i in range(len(movies_with_overviews)):
@@ -26,14 +33,17 @@
     all_ids.extend(genre_ids)
 
 # binarize the genres for each movie
+print('Binarizing the list of genres to create the target variable Y.')
 from sklearn.preprocessing import MultiLabelBinarizer
 mlb=MultiLabelBinarizer()
 Y=mlb.fit_transform(genres)
-
+print("Done! Y created. Shape of Y is:")
 print (Y.shape)
+print('\n')
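+# Illustrative example (comment only, based on scikit-learn's documented behavior):
+# MultiLabelBinarizer turns each movie's list of genre ids into a binary row,
+# one column per distinct id, e.g.
+#   MultiLabelBinarizer().fit_transform([[28, 12], [12]])
+#   -> array([[1, 1],
+#             [1, 0]])   # columns follow classes_ == array([12, 28])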
 
 # tmdb package provides a method that will provide a dictionary that maps genre ids to genre name.
 # we may need to add something if that list is incorrect.
+print("Creating a mapping from the genre ids to the genre names...")
 genres=tmdb.Genres()
 # the movie_list() method of the Genres() class returns a listing of all genres in the form of a dictionary.
 list_of_genres=genres.movie_list()['genres']
@@ -47,29 +57,39 @@
         print(i)
     if i == 10769:
         Genre_ID_to_name[10769]="Foreign" # look up what the above genre ids are. see if there's a programmatic way to do it
+print("Mapping from genre id to genre name is saved in the Genre_ID_to_name dictionary:")
+pprint.pprint(Genre_ID_to_name, indent=4)
+print('\n')
+# import re
-
-import re
-
-# remove some punctuation. probably a much better way to do this
-content=[]
-for i in range(len(movies_with_overviews)):
-    movie=movies_with_overviews[i]
-    id=movie['id']
-    overview=movie['overview']
-    overview=overview.replace(',','')
-    overview=overview.replace('.','')
-    content.append(overview)
+# # remove some punctuation. probably a much better way to do this
+# content=[]
+# for i in range(len(movies_with_overviews)):
+#     movie=movies_with_overviews[i]
+#     id=movie['id']
+#     overview=movie['overview']
+#     overview=overview.replace(',','')
+#     overview=overview.replace('.','')
+#     content.append(overview)
 
 import pickle
-with open('data/processed/Y.pkl','wb') as f:
-    pickle.dump(Y,f)
+# print('Saving the mapping from genre id to genre name as data/processed/Genredict.pkl...')
 with open('data/processed/Genredict.pkl','wb') as f:
     pickle.dump(Genre_ID_to_name,f)
-with open('data/processed/movies_with_overviews.pkl','wb') as f:
-    pickle.dump(movies_with_overviews,f)
+print('Saved the mapping from genre id to genre name as data/processed/Genredict.pkl.')
+
+# print("Saving the target variable Y to data/processed/Y.pkl...")
+with open('data/processed/Y.pkl','wb') as f:
+    pickle.dump(Y,f)
+print("Saved the target variable Y to data/processed/Y.pkl.\n")
+print('\tHere are the first few rows of Y:')
+print('\t'+str(Y[:5]))
+
+
+# with open('data/processed/movies_with_overviews.pkl','wb') as f:
+#     pickle.dump(movies_with_overviews,f)
diff --git a/src/data/movie_list.py b/src/data/movie_list.py
index 513cb6f..bc97834 100644
--- a/src/data/movie_list.py
+++ b/src/data/movie_list.py
@@ -1,28 +1,33 @@
 from src.utils.initialize import *
+import pprint
 
 # Get the text data from top 1000 popular movies ########
 all_movies=tmdb.Movies()
 top_movies=all_movies.popular()
 
 # TODO parameterize by making top N movies
-all_movies=tmdb.Movies()
 top1000_movies=[]
-print('Pulling movie list, Please wait...')
+print('Pulling the list of popular movies, please wait...')
+print('\tWhile you wait, here is a sampling of the movies being pulled...')
 for i in range(1,51):
     if i%10==0:
+        print('\t' + str(i) + '/50 done')
+        print('\t******* Waiting a few seconds to stay within rate limits of TMDB... *******')
         time.sleep(7)
-        print(str(i)+'/51...')
     movies_on_this_page=all_movies.popular(page=i)['results']
+    print('\t\t'+movies_on_this_page[-1]['title'])
     top1000_movies.extend(movies_on_this_page)
 len(top1000_movies)
-print('Done!')
-
+print('Done! Pulled a list of the top {n} movies.'.format(n=len(top1000_movies)))
+print('\n')
+print('Extracting the genre ids associated with the movies...')
 genre_ids_ = list(map(lambda x: x['genre_ids'], top1000_movies))
 genre_ids_ = [item for sublist in genre_ids_ for item in sublist]
 nr_ids = list(set(genre_ids_))
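+# Illustrative note (comment only): the comprehension above flattens a list of
+# lists, e.g. [[28, 12], [12, 16]] -> [28, 12, 12, 16], and set() then drops
+# the duplicate genre ids.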
-
+print('Done! We have identified {n} genres in the top {m} most popular movies.'.format(n=len(nr_ids), m=len(top1000_movies)))
+print('\n')
 
 ##############################
 # Get poster data from another sample of movies from the genres listed in the top 1000 movies for a specific year #################
@@ -33,10 +38,10 @@
 movies = []
 baseyear = 2017
 
-print('Starting pulling movies from TMDB. This will take a while, please wait...')
+print('Starting to pull movies from TMDB for each genre. This will take a while, please wait...')
 done_ids=[]
 for g_id in nr_ids:
-    print('Pulling movies for genre ID '+str(g_id))
+    print('\tPulling movies for genre ID {g_id}. Here is a sample of movies in the genre:'.format(g_id=str(g_id)))
     baseyear -= 1
     for page in range(1,6,1):  # (1,6,1)
         time.sleep(1)
@@ -49,14 +54,17 @@
         dataDict = json.loads(data)
         movies.extend(dataDict["results"])
+        last_movies = list(map(lambda x: x['title'],movies[-3:]))
+        for title in last_movies:
+            print('\t\t'+title)
     done_ids.append(str(g_id))
-print("Pulled movies for genres - "+','.join(done_ids))
+print("\tPulled movies for genres - "+','.join(done_ids))
+print('\n')
 
 # Remove duplicates
 movie_ids = [m['id'] for m in movies]
-print ("originally we had ",len(movie_ids)," movies")
+print ("Originally we had ",len(movie_ids)," movies")
 movie_ids=np.unique(movie_ids)
-print (len(movie_ids))
 seen_before=[]
 no_duplicate_movies=[]
 for i in range(len(movies)):
@@ -70,13 +78,29 @@
         no_duplicate_movies.append(movie)
 print ("After removing duplicates we have ",len(no_duplicate_movies), " movies")
+print('\n')
 
-
-with open('data/interim/movie_list.pkl','wb') as f:
-    pickle.dump(top1000_movies,f)
+
+# print("Saving the list of top 1000 movies (top1000_movies) as data/interim/movie_list.pkl...")
+# print('Here are the first 3 entries in top1000_movies:')
+# print(top1000_movies[:3])
+# with open('data/interim/movie_list.pkl','wb') as f:
+#     pickle.dump(top1000_movies,f)
+# print("Saved the list of top 1000 movies as data/interim/movie_list.pkl.")
+
+print("Saving the de-duped list of movies (no_duplicate_movies) as data/interim/no_duplicate_movies.pkl...")
+print('\tHere are the first 3 entries in no_duplicate_movies:')
+pprint.pprint(no_duplicate_movies[:3], indent=4)
 with open('data/interim/no_duplicate_movies.pkl', 'wb') as f:
     pickle.dump(no_duplicate_movies, f)
-with open('data/interim/movies.pkl', 'wb') as f:
-    pickle.dump(movies, f)
-
+print("Saved the de-duped list of movies as data/interim/no_duplicate_movies.pkl.")
+
+# print("Saving the list of movies pulled by genre (movies) as data/interim/movies.pkl...")
+# print('Here are the first 3 entries in movies:')
+# print(movies[:3])
+# with open('data/interim/movies.pkl', 'wb') as f:
+#     pickle.dump(movies, f)
+# print("Saved the list of movies pulled by genre (movies) as data/interim/movies.pkl.")
+
+
 ## TODO include a dominostats.json
diff --git a/src/data/overviews.py b/src/data/overviews.py
index c22e8ad..108d718 100644
--- a/src/data/overviews.py
+++ b/src/data/overviews.py
@@ -1,14 +1,18 @@
 from src.utils.initialize import *
+import pprint
 
 # build dataset
 # cleaning
 # load no_duplicate_movies
+print("Loading the list of de-duped movies from data/interim/no_duplicate_movies.pkl...")
 with open('data/interim/no_duplicate_movies.pkl','rb') as f:
     no_duplicate_movies=pickle.load(f)
+print("Loaded the list of de-duped movies from data/interim/no_duplicate_movies.pkl.\n")
 
 # get movies with overviews
+print("Creating a dataset where each movie must have an associated overview...")
 movies_with_overviews=[]  # from poster data
 for i in range(len(no_duplicate_movies)):
     movie=no_duplicate_movies[i]
@@ -19,8 +23,13 @@
         continue
     else:
         movies_with_overviews.append(movie)
-
+print("Done! Created a dataset where each movie must have an associated overview.\n")
 len(movies_with_overviews)
+
+print("Saving the list of movies that have overviews (movies_with_overviews) as data/interim/movies_with_overviews.pkl...")
+print('\tHere is the first entry in movies_with_overviews:')
+pprint.pprint(movies_with_overviews[0], indent=4)
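+# Descriptive note (comment only): each entry is the raw TMDB result dict;
+# the fields this pipeline relies on downstream are 'id', 'title', 'overview'
+# and 'genre_ids' (see movie_list.py and cleaning_data.py). Any other fields
+# are carried along unchanged.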
 with open('data/interim/movies_with_overviews.pkl','wb') as f:
-    pickle.dump(movies_with_overviews,f)
\ No newline at end of file
+    pickle.dump(movies_with_overviews,f)
+print("Saved the list of movies that have overviews (movies_with_overviews) as data/interim/movies_with_overviews.pkl.")
\ No newline at end of file
diff --git a/src/features/feature_eng.py b/src/features/feature_eng.py
index df470d4..fde2bca 100644
--- a/src/features/feature_eng.py
+++ b/src/features/feature_eng.py
@@ -1,13 +1,16 @@
 from src.utils.initialize import *
-import re
+# import re
 
 with open('data/processed/Y.pkl','rb') as f:
     Y=pickle.load(f)
-with open('data/processed/movies_with_overviews.pkl','rb') as f:
+print("Loaded the target variable Y from data/processed/Y.pkl.\n")
+with open('data/interim/movies_with_overviews.pkl','rb') as f:
     movies_with_overviews=pickle.load(f)
+print("Loaded the list of de-duped movies with overviews from data/interim/movies_with_overviews.pkl.")
 with open('data/processed/Genredict.pkl','rb') as f:
-    Genre_ID_to_name=pickle.load(f)
+    Genre_ID_to_name=pickle.load(f)
+print('Loaded the mapping from genre id to genre name from data/processed/Genredict.pkl.')
 
 genre_names=list(Genre_ID_to_name.values())
 
@@ -17,6 +20,7 @@ def remove_punctuation(input_string):
     cleaned_string = input_string.replace('.','')
     return cleaned_string
 
+
 content=[]
 for i in range(len(movies_with_overviews)):
     movie=movies_with_overviews[i]
@@ -24,28 +28,38 @@ def remove_punctuation(input_string):
     overview=movie['overview']
     overview=remove_punctuation(overview)
     content.append(overview)
-
+print("Removed punctuation from the overviews.")
 
 # Count Vectorize
+
 from sklearn.feature_extraction.text import CountVectorizer
 vectorize=CountVectorizer(max_df=0.95, min_df=0.005)
 X=vectorize.fit_transform(content)
-print("Shape of X with count vectorizer:")
-print(X.shape)
+print("Vectorized the text of the overviews using the CountVectorizer from scikit-learn. This is basically the bag-of-words model.")
+print("\tShape of X with count vectorizer:")
+print('\t'+str(X.shape))
+
 with open('data/processed/X.pkl','wb') as f:
     pickle.dump(X,f)
 with open('models/count_vectorizer.pkl','wb') as f:
     pickle.dump(vectorize,f)
+print("\tSaved X to data/processed/X.pkl and the vectorizer as models/count_vectorizer.pkl.")
+print('\tHere is the first row of X (remember that it is a sparse matrix):')
+print('\t {X}'.format(X=X[0]))
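+# Note on the vectorizer settings (per scikit-learn's documented semantics):
+# max_df=0.95 drops terms that appear in more than 95% of overviews and
+# min_df=0.005 drops terms that appear in fewer than 0.5% of them, pruning
+# boilerplate words and very rare tokens from the vocabulary.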
 
 # TF-IDF
 from sklearn.feature_extraction.text import TfidfTransformer
 tfidf_transformer = TfidfTransformer()
 X_tfidf = tfidf_transformer.fit_transform(X)
-print("Shape of X_tfidf:")
-print(X_tfidf.shape)
+print("Reweighted the bag-of-words counts using the TfidfTransformer from scikit-learn.")
+print("\tShape of X_tfidf:")
+print('\t'+str(X_tfidf.shape))
 with open('data/processed/X_tfidf.pkl','wb') as f:
     pickle.dump(X_tfidf,f)
 with open('models/tfidf_transformer.pkl','wb') as f:
     pickle.dump(tfidf_transformer,f)
+print("\tSaved X_tfidf to data/processed/X_tfidf.pkl and the transformer as models/tfidf_transformer.pkl.")
+print('\tHere is the first row of X_tfidf (remember that it is a sparse matrix):')
+print('\t {X}'.format(X=X_tfidf[0]))
diff --git a/src/features/word2vec_features.py b/src/features/word2vec_features.py
index eda6bf0..ee751b7 100644
--- a/src/features/word2vec_features.py
+++ b/src/features/word2vec_features.py
@@ -5,13 +5,14 @@
 import os
 from sklearn.model_selection import train_test_split
 
-with open('data/processed/movies_with_overviews.pkl','rb') as f:
+with open('data/interim/movies_with_overviews.pkl','rb') as f:
     final_movies_set=pickle.load(f)
+print("Loaded the list of de-duped movies with overviews from data/interim/movies_with_overviews.pkl.")
 
 from gensim import models
 model2 = models.KeyedVectors.load_word2vec_format('data/external/GoogleNews-vectors-negative300-SLIM.bin', binary=True)
-
+print("Loaded the slimmed GoogleNews word2vec model.")
 
 from nltk.tokenize import RegexpTokenizer
 from stop_words import get_stop_words
diff --git a/src/models/get_word2vec.sh b/src/models/get_word2vec.sh
index 4426549..888fa12 100644
--- a/src/models/get_word2vec.sh
+++ b/src/models/get_word2vec.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
 # wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
+echo "Downloading the SLIMMED word2vec model..."
 wget https://github.com/eyaler/word2vec-slim/raw/master/GoogleNews-vectors-negative300-SLIM.bin.gz
-echo "Decompressing..."
+echo "Decompressing the model..."
 gunzip -f -v GoogleNews-vectors-negative300-SLIM.bin.gz
 echo "Decompressed. Moving..."
 mv GoogleNews-vectors-negative300-SLIM.bin /mnt/data/external/GoogleNews-vectors-negative300-SLIM.bin
diff --git a/src/utils/initialize.py b/src/utils/initialize.py
index b5e02a4..7469a0d 100644
--- a/src/utils/initialize.py
+++ b/src/utils/initialize.py
@@ -25,7 +25,7 @@
 # set here the path where you want the scraped folders to be saved!
 poster_folder='data/raw/posters/'
 if poster_folder.split('/')[0] in os.listdir('./'):
-    print('Folder already exists')
+    pass
 else:
     os.mkdir('./'+poster_folder)
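+# NOTE (comment only): os.mkdir is not recursive, so this call raises
+# FileNotFoundError unless 'data/raw/' already exists;
+# os.makedirs('./'+poster_folder, exist_ok=True) would create the whole path
+# and also make the existence check above unnecessary.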