Skip to content
This repository has been archived by the owner on May 27, 2024. It is now read-only.

feat: Add initial implementation of data analysis tools #1

Merged
merged 21 commits into from
Feb 10, 2021
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f7001e1
Chore: Re-pushing the project to a fresh branch
AnkitRajSri Dec 8, 2020
bca19ce
Delete env
AnkitRajSri Dec 12, 2020
5885352
Chore: Added comments for the scripts, removed hardcoded path
AnkitRajSri Dec 16, 2020
7c532da
Chore: Added comments for the scripts, removed hardcoded path
AnkitRajSri Dec 16, 2020
f7f9f65
Chore: Added comments for the scripts, removed hardcoded path
AnkitRajSri Dec 16, 2020
a461f4f
Chore: Added comments for the file
AnkitRajSri Dec 16, 2020
cf0722f
Chore: Added comments for the scripts, removed hardcoded path
AnkitRajSri Dec 16, 2020
f61d8e4
Chore: Hide the secret key
AnkitRajSri Dec 16, 2020
f36d217
Chore: Added comments for the scripts, removed hardcoded path
AnkitRajSri Dec 16, 2020
1d58884
Chore: Added comments for the scripts, removed hardcoded path
AnkitRajSri Dec 16, 2020
2298b63
Chore: Updated the gitignore file for .pyc files
AnkitRajSri Dec 16, 2020
701aef3
Chore: No updates
AnkitRajSri Dec 16, 2020
0ab3245
Chore: Created sql schema file
AnkitRajSri Dec 16, 2020
ec2abc6
Chore: Created README file
AnkitRajSri Dec 16, 2020
212aeed
Chore: Added comments for the scripts, removed hardcoded path
AnkitRajSri Dec 16, 2020
111d5fb
Chore: Added requirements file
AnkitRajSri Dec 16, 2020
b54bee7
Chore: Hide the secret key
AnkitRajSri Dec 21, 2020
fdd4de4
Chore: Added config.py guidelines in instructions in the README
AnkitRajSri Dec 21, 2020
90fed61
Chore: Minor UI update
AnkitRajSri Dec 30, 2020
5950dd9
Fix: Fixed data builder issue
AnkitRajSri Jan 15, 2021
dccfeb3
Chore: Adding README file
AnkitRajSri Feb 3, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
7 changes: 7 additions & 0 deletions AI/ETL.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
class ETL:
    """Trigger the database-side build of the final Spotify table."""

    def __init__(self, conn):
        """Store the database handle used by :meth:`build_final_table`.

        ``conn`` must expose ``begin()`` returning a context manager whose
        entered value has an ``execute`` method (presumably a SQLAlchemy
        engine -- confirm against the caller).
        """
        self.conn = conn

    def build_final_table(self):
        """Run the ``dbo.BLD_SPOTIFY_DATA`` stored procedure inside ``begin()``."""
        transaction = self.conn.begin()
        with transaction as active_conn:
            active_conn.execute('exec dbo.BLD_SPOTIFY_DATA')
1 change: 1 addition & 0 deletions AI/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# __init__.py marks this folder as a Python package
Binary file added AI/__pycache__/ETL.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/models.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/muserdatabuilder.cpython-37.pyc
Binary file not shown.
Binary file not shown.
80 changes: 80 additions & 0 deletions AI/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
'''
/*
* Copyright (C) 2019-2020 University of South Florida
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'''

# Import the dependencies
import pandas as pd
from textslack.textslack import TextSlack
from gensim.models import doc2vec

AnkitRajSri marked this conversation as resolved.
Show resolved Hide resolved

# Architecture of the NLP Model
class NLPModel:
    """Doc2Vec model over the rows of the ``SPOTIFY_DATA`` table.

    Each table row becomes one training document built from its ``album``,
    ``name`` and ``artist`` columns and tagged with its position in the
    frame. ``most_similar_doc`` maps free text to the closest row.
    """

    # Lower bound for the learning rate; also the initial ``min_alpha``.
    _MIN_ALPHA = 0.00025
    # Amount the learning rate is reduced by after every training pass.
    _ALPHA_STEP = 0.0002

    def __init__(self, sp, conn, max_epochs=100, vec_size=50, alpha=0.025):
        """Store the hyper-parameters and load ``SPOTIFY_DATA`` from ``conn``.

        Args:
            sp: Spotify client; stored on the instance, not used by this class.
            conn: Database connectable passed to ``pandas.read_sql_table``.
            max_epochs: Number of passes of the outer training loop.
            vec_size: Dimensionality of the document vectors.
            alpha: Initial learning rate.
        """
        self.sp = sp
        self.conn = conn
        self.slack = TextSlack(variety='BrE', lang='english')
        self.max_epochs = max_epochs
        self.vec_size = vec_size
        self.alpha = alpha
        self.df = pd.read_sql_table('SPOTIFY_DATA', con=self.conn)

    def _create_tagged_document(self, list_of_list_of_words):
        """Yield one ``TaggedDocument`` per word list, tagged with its index."""
        for i, list_of_words in enumerate(list_of_list_of_words):
            yield doc2vec.TaggedDocument(list_of_words, [i])

    def _training_data(self):
        """Return one whitespace-split token list per row of ``self.df``.

        The text of a row is ``album + ' ' + name + ' ' + artist``, passed
        through ``self.slack.transform`` before splitting.
        """
        key_features = (self.df['album'] + ' ' + self.df['name'] + ' ' + self.df['artist']).tolist()
        cleaned_key_features = self.slack.transform(key_features)
        return [sent.split() for sent in cleaned_key_features]

    def build_model(self):
        """Train a Doc2Vec model on the table and save it to ``d2v.model``."""
        train_data = list(self._create_tagged_document(self._training_data()))
        # ``vector_size`` / ``epochs`` are the current gensim names; the old
        # ``size`` / ``iter`` spellings are deprecated and removed in gensim 4.
        model = doc2vec.Doc2Vec(vector_size=self.vec_size,
                                alpha=self.alpha,
                                min_alpha=self._MIN_ALPHA,
                                min_count=1,
                                dm=1)
        model.build_vocab(train_data)
        for epoch in range(self.max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(train_data,
                        total_examples=model.corpus_count,
                        epochs=model.epochs)
            # Decrease the learning rate, but never below the floor: without
            # the clamp a large ``max_epochs`` drives alpha to zero or below.
            model.alpha = max(model.alpha - self._ALPHA_STEP, self._MIN_ALPHA)
            # Fix the learning rate for the next pass, no decay within it.
            model.min_alpha = model.alpha
        model.save('d2v.model')
        print("Model Saved")

    def most_similar_doc(self, target):
        """Return feature columns of the table row most similar to ``target``.

        Loads ``d2v.model`` from the working directory, infers a vector for
        the cleaned ``target`` text and looks up the best-matching document
        tag, which is used as a row label into ``self.df``.

        Returns:
            ``self.df.loc[row, self.df.columns[6:-1]]`` -- the columns from
            position 6 up to, but excluding, the last one.
            NOTE(review): presumably the audio features plus popularity;
            confirm against the SPOTIFY_DATA schema.
        """
        model = doc2vec.Doc2Vec.load('d2v.model')
        # Re-seed the model RNG before inference (intended to make the
        # inferred vector repeatable between calls).
        model.random.seed(95)
        cleaned_target = self.slack.transform(target).split()
        pred_vector = model.infer_vector(cleaned_target)
        sim_vector = model.docvecs.most_similar([pred_vector])
        # Results are (tag, similarity) pairs, best first; tags are the
        # enumerate() indices assigned in _create_tagged_document.
        pred_index = sim_vector[0][0]
        return self.df.loc[pred_index, self.df.columns[6:-1]]
73 changes: 73 additions & 0 deletions AI/muserdatabuilder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
'''
/*
* Copyright (C) 2019-2020 University of South Florida
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'''

# Import the dependencies
import pandas as pd
AnkitRajSri marked this conversation as resolved.
Show resolved Hide resolved
from AI.models import NLPModel


AnkitRajSri marked this conversation as resolved.
Show resolved Hide resolved
# Architecture of the Muser Data Builder
class MuserDataBuilder:
    """Enrich ``music-analysis.csv`` with Spotify audio features and popularity.

    Rows that Spotify cannot resolve are filled from the closest match
    returned by ``NLPModel.most_similar_doc``.
    """

    # Audio-feature columns copied one-to-one from the feature source.
    _FEATURE_COLUMNS = (
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'speechiness',
        'tempo',
        'valence',
    )

    def __init__(self, sp, conn):
        """Store the clients and load ``music-analysis.csv`` from the working directory.

        Args:
            sp: Spotify client exposing ``search``, ``audio_features`` and ``track``.
            conn: Database connectable handed to ``NLPModel`` for the fallback.
        """
        self.sp = sp
        self.conn = conn
        self.df = pd.read_csv('music-analysis.csv')

    def build_muser_data(self):
        """Add the feature columns to ``self.df`` and overwrite the csv.

        For every row a Spotify track search is issued from the album, track
        and artist names. When a track with audio features is found, its
        features and popularity are stored. Otherwise the features come from
        the NLP model; the NLP popularity is used only when Spotify supplied
        none.
        """
        # Object dtype so the '' placeholders can later be overwritten with
        # numbers regardless of the pandas string-dtype defaults.
        for column in self._FEATURE_COLUMNS + ('popularity',):
            self.df[column] = pd.Series('', index=self.df.index, dtype=object)

        # Built lazily and at most once: constructing NLPModel re-reads the
        # whole SPOTIFY_DATA table, which is wasteful to repeat per row.
        nlp_model = None

        for idx in self.df.index:
            album = self.df.loc[idx, 'song_album_name']
            track = self.df.loc[idx, 'song_name']
            artist = self.df.loc[idx, 'song_artist_name']
            query = 'album:{} track:{} artist:{}'.format(album, track, artist)
            spotify_search = self.sp.search(query, limit=1, offset=0, type='track', market=None)

            audio_features = None
            popularity = None
            items = spotify_search['tracks']['items']
            if items:
                track_uri = items[0]['uri']
                # The entry may be None when Spotify has no audio analysis
                # for the track (see the Spotify Web API reference).
                audio_features = self.sp.audio_features(track_uri)[0]
                popularity = self.sp.track(track_uri)['popularity']

            if audio_features is None:
                if nlp_model is None:
                    nlp_model = NLPModel(self.sp, self.conn)
                # Skip missing (NaN) cells instead of failing on str + float.
                target = ' '.join(
                    str(part) for part in (album, track, artist) if pd.notna(part)
                )
                audio_features = nlp_model.most_similar_doc(target)
                if popularity is None:
                    popularity = audio_features['popularity']

            self.df.loc[idx, 'popularity'] = popularity
            for column in self._FEATURE_COLUMNS:
                self.df.loc[idx, column] = audio_features[column]

        # index=False: the file is read back without an index column, so
        # writing the index would add an 'Unnamed: 0' column on every run.
        self.df.to_csv('music-analysis.csv', index=False)
Loading