This repository has been archived by the owner on May 27, 2024. It is now read-only.

feat: Add initial implementation of data analysis tools (#1)
AnkitRajSri authored Feb 10, 2021
1 parent 066b81a commit 04fa9c3
Showing 45 changed files with 31,470 additions and 0 deletions.
138 changes: 138 additions & 0 deletions .gitignore
@@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
7 changes: 7 additions & 0 deletions AI/ETL.py
@@ -0,0 +1,7 @@
# Thin wrapper around the database step of the pipeline
class ETL:
    def __init__(self, conn):
        self.conn = conn

    # Run the stored procedure that builds the final Spotify data table
    def build_final_table(self):
        with self.conn.begin() as conn:
            conn.execute('exec dbo.BLD_SPOTIFY_DATA')
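
For context, a minimal usage sketch for the ETL class. The connection URL below is a placeholder assumption, not part of this commit; the class only needs an SQLAlchemy engine whose begin() yields a transactional connection, and the dbo.BLD_SPOTIFY_DATA stored procedure must already exist in the target SQL Server database:

# Hypothetical wiring; the connection URL and driver are assumptions.
from sqlalchemy import create_engine

from AI.ETL import ETL

engine = create_engine('mssql+pyodbc://user:pass@server/muser'
                       '?driver=ODBC+Driver+17+for+SQL+Server')
ETL(engine).build_final_table()  # runs dbo.BLD_SPOTIFY_DATA in a transaction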
1 change: 1 addition & 0 deletions AI/__init__.py
@@ -0,0 +1 @@
# Empty __init__ module that marks this folder as a Python package
Binary file added AI/__pycache__/ETL.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/models.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/muserdatabuilder.cpython-37.pyc
Binary file not shown.
80 changes: 80 additions & 0 deletions AI/models.py
@@ -0,0 +1,80 @@
'''
/*
* Copyright (C) 2019-2020 University of South Florida
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'''

# Import the dependencies
import pandas as pd
from textslack.textslack import TextSlack
from gensim.models import doc2vec


# Architecture of the NLP Model
class NLPModel:

    # The constructor instantiates all the variables used throughout the class
    def __init__(self, sp, conn, max_epochs=100, vec_size=50, alpha=0.025):
        self.sp = sp
        self.conn = conn
        self.slack = TextSlack(variety='BrE', lang='english')
        self.max_epochs = max_epochs
        self.vec_size = vec_size
        self.alpha = alpha
        self.df = pd.read_sql_table('SPOTIFY_DATA', con=self.conn)

    # Tag each list of words with its row index
    def _create_tagged_document(self, list_of_list_of_words):
        for i, list_of_words in enumerate(list_of_list_of_words):
            yield doc2vec.TaggedDocument(list_of_words, [i])

    # Prepare the training data: one cleaned token list per track
    def _training_data(self):
        key_features = (self.df['album'] + ' ' + self.df['name'] + ' ' + self.df['artist']).tolist()
        cleaned_key_features = self.slack.transform(key_features)
        list_list_words = [sent.split() for sent in cleaned_key_features]
        return list_list_words

    # Build and train the doc2vec model
    def build_model(self):
        list_list_words = self._training_data()
        train_data = list(self._create_tagged_document(list_list_words))
        # vector_size/epochs replace the deprecated size/iter parameter names
        model = doc2vec.Doc2Vec(vector_size=self.vec_size,
                                alpha=self.alpha,
                                min_alpha=0.00025,
                                min_count=1,
                                dm=1)
        model.build_vocab(train_data)
        for epoch in range(self.max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(train_data,
                        total_examples=model.corpus_count,
                        epochs=model.epochs)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        model.save('d2v.model')
        print("Model Saved")

    # Predict the most similar doc in the doc2vec model
    def most_similar_doc(self, target):
        model = doc2vec.Doc2Vec.load('d2v.model')
        model.random.seed(95)  # fix the RNG so inference is repeatable
        cleaned_target = self.slack.transform(target).split()
        pred_vector = model.infer_vector(cleaned_target)
        sim_vector = model.docvecs.most_similar([pred_vector])
        pred_index = sim_vector[0][0]
        # Return the audio-feature columns of the closest matching track
        return self.df.loc[pred_index, self.df.columns[6:-1]]
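
A minimal sketch of how NLPModel might be driven, assuming `engine` is an SQLAlchemy engine for the database holding the SPOTIFY_DATA table; the connection URL is a placeholder, and the `sp` argument is stored by the constructor but not used inside this class:

# Hypothetical usage; the connection URL is an assumption, not part of this commit.
from sqlalchemy import create_engine

from AI.models import NLPModel

engine = create_engine('mssql+pyodbc://user:pass@server/muser'
                       '?driver=ODBC+Driver+17+for+SQL+Server')
model = NLPModel(sp=None, conn=engine)  # sp is unused by the model itself
model.build_model()  # trains and saves d2v.model in the working directory
features = model.most_similar_doc('Some Album Some Track Some Artist')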
84 changes: 84 additions & 0 deletions AI/muserdatabuilder.py
@@ -0,0 +1,84 @@
'''
/*
* Copyright (C) 2019-2020 University of South Florida
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'''

# Import the dependencies
import pandas as pd
import time
import numpy as np
from AI.models import NLPModel


# Architecture of the Muser Data Builder
class MuserDataBuilder:

    # The constructor instantiates all the variables used throughout the class
    def __init__(self, sp, conn):
        self.sp = sp
        self.conn = conn
        self.df = pd.read_csv('music-analysis.csv')

    # Function to add feature columns to the muser data
    # Replace the existing csv
    def build_muser_data(self):
        # Initialise the new feature columns as empty strings
        feature_cols = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                        'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
        for col in feature_cols + ['popularity']:
            self.df[col] = ''

        sleep_min = 2
        sleep_max = 5
        request_count = 0

        for idx in self.df.index:
            album = self.df.loc[idx, 'song_album_name']
            track = self.df.loc[idx, 'song_name']
            artist = self.df.loc[idx, 'song_artist_name']
            query = 'album:{} track:{} artist:{}'.format(album, track, artist)
            spotify_search = self.sp.search(query, limit=1, offset=0, type='track', market=None)

            # Sleep every five requests to stay under Spotify's rate limits
            request_count += 1
            if request_count % 5 == 0:
                time.sleep(np.random.uniform(sleep_min, sleep_max))

            if len(spotify_search['tracks']['items']) > 0:
                track_uri = spotify_search['tracks']['items'][0]['uri']
                audio_features = self.sp.audio_features(track_uri)[0]
                self.df.loc[idx, 'popularity'] = self.sp.track(track_uri)['popularity']
            else:
                # No Spotify match: fall back to the doc2vec model's closest track
                target = album + ' ' + track + ' ' + artist
                nlp_model = NLPModel(self.sp, self.conn)
                audio_features = nlp_model.most_similar_doc(target)
                self.df.loc[idx, 'popularity'] = audio_features['popularity']

            # Copy the remaining audio features for this row
            for col in feature_cols:
                self.df.loc[idx, col] = audio_features[col]

        # index=False keeps the rewritten csv free of a spurious index column on re-read
        self.df.to_csv('music-analysis.csv', index=False)
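
A minimal end-to-end sketch tying the pieces together; the Spotify credentials and connection URL are placeholders, not part of this commit, and music-analysis.csv must exist in the working directory:

# Hypothetical wiring; credentials and the connection URL are assumptions.
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sqlalchemy import create_engine

from AI.muserdatabuilder import MuserDataBuilder

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET'))
engine = create_engine('mssql+pyodbc://user:pass@server/muser'
                       '?driver=ODBC+Driver+17+for+SQL+Server')

MuserDataBuilder(sp, engine).build_muser_data()  # enriches music-analysis.csv in place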