add stdout messaging during data prep process
John Joo committed Apr 19, 2019
1 parent 7fb600c commit a0b02ea
Showing 7 changed files with 119 additions and 50 deletions.
58 changes: 39 additions & 19 deletions src/data/cleaning_data.py
@@ -1,4 +1,5 @@
from src.utils.initialize import *
import pprint

# make sure there are the same number of distinct genres in movies_with_overviews

@@ -7,15 +8,21 @@
# cleaning

# load no_duplicate_movies
with open('data/interim/no_duplicate_movies.pkl','rb') as f:
no_duplicate_movies=pickle.load(f)
# # print("Loading the list of de-duped movies from data/interim/no_duplicate_movies.pkl...")
# with open('data/interim/no_duplicate_movies.pkl','rb') as f:
# no_duplicate_movies=pickle.load(f)
# print("Loaded the list of de-duped movies from data/interim/no_duplicate_movies.pkl.")

# print("Loading the list of movies that have overviews from data/interim/movies_with_overviews.pkl...")
with open('data/interim/movies_with_overviews.pkl','rb') as f:
movies_with_overviews=pickle.load(f)
print("Loaded the list of movies that have overviews from data/interim/movies_with_overviews.pkl.\n")



# Y
# list of genres and movie ids in prep for binarization
print("Extracting the genres and movie ids in prep for binarization...")
genres=[]
all_ids=[]
for i in range(len(movies_with_overviews)):
@@ -26,14 +33,17 @@
all_ids.extend(genre_ids)

# binarize the genres for each movie
print('Binarizing the list of genres to create the target variable Y.')
from sklearn.preprocessing import MultiLabelBinarizer
mlb=MultiLabelBinarizer()
Y=mlb.fit_transform(genres)

print("Done! Y created. Shape of Y is ")
print (Y.shape)
print('\n')
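As a quick illustration of what MultiLabelBinarizer does with these per-movie genre lists (a toy sketch, not part of the pipeline):

from sklearn.preprocessing import MultiLabelBinarizer
toy_genres = [[28, 12], [12], [28, 878]]   # per-movie lists of genre ids
mlb_demo = MultiLabelBinarizer()
print(mlb_demo.fit_transform(toy_genres))  # rows are movies, columns are sorted genre ids
# [[1 1 0]
#  [0 1 0]
#  [1 0 1]]
print(mlb_demo.classes_)                   # [ 12  28 878]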

# the tmdb package provides a method that returns a dictionary mapping genre ids to genre names.
# we may need to add something if that list is incorrect.
print("Creating a mapping from the genre ids to the genre names...")
genres=tmdb.Genres()
# the movie_list() method of the Genres() class returns a listing of all genres in the form of a dictionary.
list_of_genres=genres.movie_list()['genres']
@@ -47,29 +57,39 @@
print(i)
if i == 10769:
Genre_ID_to_name[10769]="Foreign" # look up what the above genre ids are. see if there's a programmatic way to do it
print("Mapping from genre id to genre name is saved in the Genre_ID_to_name dictionary:")
pprint.pprint(Genre_ID_to_name, indent=4)
print('\n')
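One programmatic way to build this mapping straight from the tmdb response, without hand-patching each missing id (a sketch; assumes list_of_genres is the list of {'id': ..., 'name': ...} dicts loaded above):

Genre_ID_to_name = {g['id']: g['name'] for g in list_of_genres}
Genre_ID_to_name.setdefault(10769, 'Foreign')  # 10769 is absent from the API's genre listing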

# import re


import re

# remove some punctuation. probably a much better way to do this
content=[]
for i in range(len(movies_with_overviews)):
movie=movies_with_overviews[i]
id=movie['id']
overview=movie['overview']
overview=overview.replace(',','')
overview=overview.replace('.','')
content.append(overview)
# # remove some punctuation. probably a much better way to do this
# content=[]
# for i in range(len(movies_with_overviews)):
# movie=movies_with_overviews[i]
# id=movie['id']
# overview=movie['overview']
# overview=overview.replace(',','')
# overview=overview.replace('.','')
# content.append(overview)



import pickle

with open('data/processed/Y.pkl','wb') as f:
pickle.dump(Y,f)
# print('Saving the mapping from genre id to genre name as data/processed/Genredict.pkl...')
with open('data/processed/Genredict.pkl','wb') as f:
pickle.dump(Genre_ID_to_name,f)
with open('data/processed/movies_with_overviews.pkl','wb') as f:
pickle.dump(movies_with_overviews,f)
print('Saved the mapping from genre id to genre name as data/processed/Genredict.pkl.')

# print("Saving the target variable Y to data/processed/Y.pkl...")
with open('data/processed/Y.pkl','wb') as f:
pickle.dump(Y,f)
print("Saved the target variable Y to data/processed/Y.pkl.\n")
print('\tHere are the first few lines of Y:')
print('\t'+str(Y[:5]))


# with open('data/processed/movies_with_overviews.pkl','wb') as f:
# pickle.dump(movies_with_overviews,f)

58 changes: 41 additions & 17 deletions src/data/movie_list.py
@@ -1,28 +1,33 @@
from src.utils.initialize import *
import pprint

# Get the text data from top 1000 popular movies ########
all_movies=tmdb.Movies()
top_movies=all_movies.popular()

# TODO parameterize by making top N movies
all_movies=tmdb.Movies()
top1000_movies=[]
print('Pulling movie list, Please wait...')
print('Pulling the list of popular movies, please wait...')
print('\tWhile you wait, here is a sampling of the movies being pulled...')
for i in range(1,51):
if i%10==0:
print('\t' + str(i) + '/50 done')
print('\t******* Waiting a few seconds to stay within rate limits of TMDB... *******')
time.sleep(7)
print(str(i)+'/51...')
movies_on_this_page=all_movies.popular(page=i)['results']
print('\t\t'+movies_on_this_page[-1]['title'])
top1000_movies.extend(movies_on_this_page)
len(top1000_movies)

print('Done!')
print('Done! Pulled a list of the top {n} movies.'.format(n = len(top1000_movies)))
print('\n')

print('Extracting the genre ids associated with the movies...')
genre_ids_ = list(map(lambda x: x['genre_ids'], top1000_movies))
genre_ids_ = [item for sublist in genre_ids_ for item in sublist]
nr_ids = list(set(genre_ids_))

print('Done! We have identified {n} genres in the top {m} most popular movies.'.format(n=len(nr_ids), m=len(top1000_movies)))
print('\n')

##############################
# Get poster data from another sample of movies from the genres listed in the top 1000 movies for a specific year #################
@@ -33,10 +38,10 @@
movies = []
baseyear = 2017

print('Starting pulling movies from TMDB. This will take a while, please wait...')
print('Starting to pull movies from TMDB for each genre. This will take a while, please wait...')
done_ids=[]
for g_id in nr_ids:
print('Pulling movies for genre ID '+str(g_id))
print('\tPulling movies for genre ID {g_id}. Here is a sample of movies in the genre:'.format(g_id=str(g_id)))
baseyear -= 1
for page in range(1,6,1): # (1,6,1)
time.sleep(1)
@@ -49,14 +54,17 @@

dataDict = json.loads(data)
movies.extend(dataDict["results"])
last_movies = list(map(lambda x: x['title'],movies[-3:]))
for title in last_movies:
print('\t\t'+title)
done_ids.append(str(g_id))
print("Pulled movies for genres - "+','.join(done_ids))
print("\tPulled movies for genres - "+','.join(done_ids))
print('\n')

# Remove duplicates
movie_ids = [m['id'] for m in movies]
print ("originally we had ",len(movie_ids)," movies")
print ("Originally we had ",len(movie_ids)," movies")
movie_ids=np.unique(movie_ids)
print (len(movie_ids))
seen_before=[]
no_duplicate_movies=[]
for i in range(len(movies)):
@@ -70,13 +78,29 @@
no_duplicate_movies.append(movie)

print ("After removing duplicates we have ",len(no_duplicate_movies), " movies")
print('\n')
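The same de-duplication can be written more compactly with a set of seen ids (an equivalent sketch, assuming every movie dict carries an 'id' key):

seen = set()
no_duplicate_movies = []
for movie in movies:
    if movie['id'] not in seen:
        seen.add(movie['id'])
        no_duplicate_movies.append(movie)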


with open('data/interim/movie_list.pkl','wb') as f:
pickle.dump(top1000_movies,f)

# print("Saving the list of top 1000 movies (top1000_movies) as data/interim/movie_list.pkl...")
# print('Here are the first 3 entries in top1000_movies:')
# print(top1000_movies[:3])
# with open('data/interim/movie_list.pkl','wb') as f:
# pickle.dump(top1000_movies,f)
# print("Saved the list of top 1000 movies as data/interim/movie_list.pkl.")

print("Saving the de-duped list of movies (no_duplicate_movies) as data/interim/no_duplicate_movies.pkl...")
print('\tHere are the first 3 entries in no_duplicate_movies:')
pprint.pprint(no_duplicate_movies[:3], indent=4)
with open('data/interim/no_duplicate_movies.pkl', 'wb') as f:
pickle.dump(no_duplicate_movies, f)
with open('data/interim/movies.pkl', 'wb') as f:
pickle.dump(movies, f)

print("Saved the de-duped list of movies as data/interim/no_duplicate_movies.pkl.")

# print("Saving the list of movies pulled by genre (movies) as data/interim/movies.pkl...")
# print('Here are the first 3 entries in movies:')
# print(movies[:3])
# with open('data/interim/movies.pkl', 'wb') as f:
# pickle.dump(movies, f)
# print("Saved the list of movies pulled by genre (movies) as data/interim/movies.pkl.")


## TODO include a dominostats.json
13 changes: 11 additions & 2 deletions src/data/overviews.py
@@ -1,14 +1,18 @@
from src.utils.initialize import *
import pprint

# build dataset

# cleaning

# load no_duplicate_movies
print("Loading the list of de-duped movies from data/interim/no_duplicate_movies.pkl...")
with open('data/interim/no_duplicate_movies.pkl','rb') as f:
no_duplicate_movies=pickle.load(f)
print("Loaded the list of de-duped movies from data/interim/no_duplicate_movies.pkl.\n")

# get movies with overviews
print("Creating a dataset where each movie must have an associated overview...")
movies_with_overviews=[] # from poster data
for i in range(len(no_duplicate_movies)):
movie=no_duplicate_movies[i]
@@ -19,8 +23,13 @@
continue
else:
movies_with_overviews.append(movie)
print("Done! Created a dataset where each movie must have an associated overview.\n")
len(movies_with_overviews)
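The same filter can be expressed as a list comprehension (a sketch; assumes a missing overview shows up as an absent key or an empty/whitespace string):

movies_with_overviews = [m for m in no_duplicate_movies
                         if m.get('overview') and m['overview'].strip()]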


print("Saving the list of movies that have overviews (movies_with_overviews) as data/interim/movies_with_overviews.pkl...")
print('\tHere is the first entry in movies_with_overviews:')
pprint.pprint(movies_with_overviews[0], indent=4)
with open('data/interim/movies_with_overviews.pkl','wb') as f:
pickle.dump(movies_with_overviews,f)
print("Saved the list of movies that have overviews (movies_with_overviews) as data/interim/movies_with_overviews.pkl.")
30 changes: 22 additions & 8 deletions src/features/feature_eng.py
@@ -1,13 +1,16 @@
from src.utils.initialize import *
import re
# import re


with open('data/processed/Y.pkl','rb') as f:
Y=pickle.load(f)
with open('data/processed/movies_with_overviews.pkl','rb') as f:
print("Loaded the target variable Y from data/processed/Y.pkl.\n")
with open('data/interim/movies_with_overviews.pkl','rb') as f:
movies_with_overviews=pickle.load(f)
print("Loaded the list of de-duped movies with overviews from data/interim/movies_with_overviews.pkl.")
with open('data/processed/Genredict.pkl','rb') as f:
Genre_ID_to_name=pickle.load(f)
print('Loaded the mapping from genre id to genre name from data/processed/Genredict.pkl.')

genre_names=list(Genre_ID_to_name.values())

@@ -17,35 +20,46 @@ def remove_punctuation(input_string):
cleaned_string = input_string.replace('.','')
return cleaned_string


content=[]
for i in range(len(movies_with_overviews)):
movie=movies_with_overviews[i]
id=movie['id']
overview=movie['overview']
overview=remove_punctuation(overview)
content.append(overview)

print("Removed punctuation from the overviews.")

# Count Vectorize

from sklearn.feature_extraction.text import CountVectorizer
vectorize=CountVectorizer(max_df=0.95, min_df=0.005)
X=vectorize.fit_transform(content)
print("Shape of X with count vectorizer:")
print(X.shape)
print("Vectorized the text of the overviews using the CountVectorizer from scikit-learn. This is essentially the bag-of-words model.")
print("\tShape of X with count vectorizer:")
print('\t'+str(X.shape))

with open('data/processed/X.pkl','wb') as f:
pickle.dump(X,f)
with open('models/count_vectorizer.pkl','wb') as f:
pickle.dump(vectorize,f)
print("\tSaved X to data/processed/X.pkl and the vectorizer as models/count_vectorizer.pkl.")
print('\tHere is the first row of X (remember that it is a sparse matrix):')
print('\t {X}'.format(X=X[0]))
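To see which terms survived the max_df/min_df pruning, the fitted vectorizer can be inspected (a sketch; get_feature_names() is the accessor in scikit-learn of this vintage, renamed get_feature_names_out() in later releases):

vocab = vectorize.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
print(len(vocab))   # number of terms kept in the vocabulary
print(vocab[:10])   # a few sample terms, in alphabetical order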

# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X)
print("Shape of X_tfidf:")
print(X_tfidf.shape)
print("Vectorized the text of the overviews using the TfidfTransformer from scikit-learn.")
print("\tShape of X with TF-IDF vectorizer:")
print('\t'+str(X_tfidf.shape))
with open('data/processed/X_tfidf.pkl','wb') as f:
pickle.dump(X_tfidf,f)
with open('models/tfidf_transformer.pkl','wb') as f:
pickle.dump(tfidf_transformer,f)
print("\tSaved X_tfidf to data/processed/X_tfidf.pkl and the vectorizer as models/tfidf_transformer.pkl.")
print('\tHere is the first row of X_tfidf (remember that it is a sparse matrix):')
print('\t {X}'.format(X=X_tfidf[0]))
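Note that CountVectorizer followed by TfidfTransformer is equivalent to scikit-learn's one-step TfidfVectorizer; a sketch of the direct route:

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorize = TfidfVectorizer(max_df=0.95, min_df=0.005)
X_tfidf_direct = tfidf_vectorize.fit_transform(content)  # same matrix as the two-step pipeline above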


5 changes: 3 additions & 2 deletions src/features/word2vec_features.py
@@ -5,13 +5,14 @@
import os
from sklearn.model_selection import train_test_split

with open('data/processed/movies_with_overviews.pkl','rb') as f:
with open('data/interim/movies_with_overviews.pkl','rb') as f:
final_movies_set=pickle.load(f)
print("Loaded the list of de-duped movies with overviews from data/interim/movies_with_overviews.pkl.")


from gensim import models
model2 = models.KeyedVectors.load_word2vec_format('data/external/GoogleNews-vectors-negative300-SLIM.bin', binary=True)

print("Loaded the GoogleNews Slimmed Word2Vec model.")
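Downstream, a common way to turn an overview into a fixed-length feature is to average the vectors of its in-vocabulary words (a hedged sketch; 'w in model' membership tests work on gensim KeyedVectors, and overview_vector is a hypothetical helper, not part of this file):

import numpy as np

def overview_vector(tokens, model, dim=300):
    # average the word2vec vectors of the tokens the model knows
    vecs = [model[w] for w in tokens if w in model]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)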

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
3 changes: 2 additions & 1 deletion src/models/get_word2vec.sh
@@ -1,8 +1,9 @@
#!/bin/bash

# wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
echo "Downloading the SLIMMED word2vec model..."
wget https://github.com/eyaler/word2vec-slim/raw/master/GoogleNews-vectors-negative300-SLIM.bin.gz
echo "Decompressing..."
echo "Decompressing the model..."
gunzip -f -v GoogleNews-vectors-negative300-SLIM.bin.gz
echo "Decompressed. Moving..."
mv GoogleNews-vectors-negative300-SLIM.bin /mnt/data/external/GoogleNews-vectors-negative300-SLIM.bin
2 changes: 1 addition & 1 deletion src/utils/initialize.py
@@ -25,7 +25,7 @@
# set here the path where you want the scraped folders to be saved!
poster_folder='data/raw/posters/'
if poster_folder.split('/')[0] in os.listdir('./'):
print('Folder already exists')
pass
else:
os.mkdir('./'+poster_folder)

