This repository has been archived by the owner on May 27, 2024. It is now read-only.

feat: Add initial implementation of data analysis tools (#1)
AnkitRajSri authored Feb 10, 2021
1 parent 066b81a commit 04fa9c3
Showing 45 changed files with 31,470 additions and 0 deletions.
138 changes: 138 additions & 0 deletions .gitignore
@@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
7 changes: 7 additions & 0 deletions AI/ETL.py
@@ -0,0 +1,7 @@
# Thin wrapper around the database step of the pipeline
class ETL:
    def __init__(self, conn):
        self.conn = conn

    # Run the stored procedure that builds the final Spotify data table
    def build_final_table(self):
        with self.conn.begin() as conn:
            conn.execute('exec dbo.BLD_SPOTIFY_DATA')
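
For context, a minimal usage sketch for the ETL class. The connection URL below is a placeholder assumption, not part of this commit; the class only needs an SQLAlchemy engine whose begin() yields a transactional connection, and the dbo.BLD_SPOTIFY_DATA stored procedure must already exist in the target SQL Server database:

# Hypothetical wiring; the connection URL and driver are assumptions.
from sqlalchemy import create_engine

from AI.ETL import ETL

engine = create_engine('mssql+pyodbc://user:pass@server/muser'
                       '?driver=ODBC+Driver+17+for+SQL+Server')
ETL(engine).build_final_table()  # runs dbo.BLD_SPOTIFY_DATA in a transaction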
1 change: 1 addition & 0 deletions AI/__init__.py
@@ -0,0 +1 @@
# Empty __init__ module that marks this folder as a Python package
Binary file added AI/__pycache__/ETL.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/models.cpython-37.pyc
Binary file not shown.
Binary file added AI/__pycache__/muserdatabuilder.cpython-37.pyc
Binary file not shown.
80 changes: 80 additions & 0 deletions AI/models.py
@@ -0,0 +1,80 @@
'''
/*
* Copyright (C) 2019-2020 University of South Florida
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'''

# Import the dependencies
import pandas as pd
from textslack.textslack import TextSlack
from gensim.models import doc2vec


# Architecture of the NLP Model
class NLPModel:

    # The constructor instantiates all the variables used throughout the class
    def __init__(self, sp, conn, max_epochs=100, vec_size=50, alpha=0.025):
        self.sp = sp
        self.conn = conn
        self.slack = TextSlack(variety='BrE', lang='english')
        self.max_epochs = max_epochs
        self.vec_size = vec_size
        self.alpha = alpha
        self.df = pd.read_sql_table('SPOTIFY_DATA', con=self.conn)

    # Tag each list of words with its row index
    def _create_tagged_document(self, list_of_list_of_words):
        for i, list_of_words in enumerate(list_of_list_of_words):
            yield doc2vec.TaggedDocument(list_of_words, [i])

    # Prepare the training data: one cleaned token list per track
    def _training_data(self):
        key_features = (self.df['album'] + ' ' + self.df['name'] + ' ' + self.df['artist']).tolist()
        cleaned_key_features = self.slack.transform(key_features)
        list_list_words = [sent.split() for sent in cleaned_key_features]
        return list_list_words

    # Build and train the doc2vec model
    def build_model(self):
        list_list_words = self._training_data()
        train_data = list(self._create_tagged_document(list_list_words))
        # vector_size/epochs replace the deprecated size/iter parameter names
        model = doc2vec.Doc2Vec(vector_size=self.vec_size,
                                alpha=self.alpha,
                                min_alpha=0.00025,
                                min_count=1,
                                dm=1)
        model.build_vocab(train_data)
        for epoch in range(self.max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(train_data,
                        total_examples=model.corpus_count,
                        epochs=model.epochs)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        model.save('d2v.model')
        print("Model Saved")

    # Predict the most similar doc in the doc2vec model
    def most_similar_doc(self, target):
        model = doc2vec.Doc2Vec.load('d2v.model')
        model.random.seed(95)  # fix the RNG so inference is repeatable
        cleaned_target = self.slack.transform(target).split()
        pred_vector = model.infer_vector(cleaned_target)
        sim_vector = model.docvecs.most_similar([pred_vector])
        pred_index = sim_vector[0][0]
        # Return the audio-feature columns of the closest matching track
        return self.df.loc[pred_index, self.df.columns[6:-1]]
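
A minimal sketch of how NLPModel might be driven, assuming `engine` is an SQLAlchemy engine for the database holding the SPOTIFY_DATA table; the connection URL is a placeholder, and the `sp` argument is stored by the constructor but not used inside this class:

# Hypothetical usage; the connection URL is an assumption, not part of this commit.
from sqlalchemy import create_engine

from AI.models import NLPModel

engine = create_engine('mssql+pyodbc://user:pass@server/muser'
                       '?driver=ODBC+Driver+17+for+SQL+Server')
model = NLPModel(sp=None, conn=engine)  # sp is unused by the model itself
model.build_model()  # trains and saves d2v.model in the working directory
features = model.most_similar_doc('Some Album Some Track Some Artist')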
84 changes: 84 additions & 0 deletions AI/muserdatabuilder.py
@@ -0,0 +1,84 @@
'''
/*
* Copyright (C) 2019-2020 University of South Florida
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'''

# Import the dependencies
import pandas as pd
import time
import numpy as np
from AI.models import NLPModel


# Architecture of the Muser Data Builder
class MuserDataBuilder:

    # The constructor instantiates all the variables used throughout the class
    def __init__(self, sp, conn):
        self.sp = sp
        self.conn = conn
        self.df = pd.read_csv('music-analysis.csv')

    # Function to add feature columns to the muser data
    # Replace the existing csv
    def build_muser_data(self):
        # Initialise the new feature columns as empty strings
        feature_cols = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                        'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
        for col in feature_cols + ['popularity']:
            self.df[col] = ''

        sleep_min = 2
        sleep_max = 5
        request_count = 0

        for idx in self.df.index:
            album = self.df.loc[idx, 'song_album_name']
            track = self.df.loc[idx, 'song_name']
            artist = self.df.loc[idx, 'song_artist_name']
            query = 'album:{} track:{} artist:{}'.format(album, track, artist)
            spotify_search = self.sp.search(query, limit=1, offset=0, type='track', market=None)

            # Sleep every five requests to stay under Spotify's rate limits
            request_count += 1
            if request_count % 5 == 0:
                time.sleep(np.random.uniform(sleep_min, sleep_max))

            if len(spotify_search['tracks']['items']) > 0:
                track_uri = spotify_search['tracks']['items'][0]['uri']
                audio_features = self.sp.audio_features(track_uri)[0]
                self.df.loc[idx, 'popularity'] = self.sp.track(track_uri)['popularity']
            else:
                # No Spotify match: fall back to the doc2vec model's closest track
                target = album + ' ' + track + ' ' + artist
                nlp_model = NLPModel(self.sp, self.conn)
                audio_features = nlp_model.most_similar_doc(target)
                self.df.loc[idx, 'popularity'] = audio_features['popularity']

            # Copy the remaining audio features for this row
            for col in feature_cols:
                self.df.loc[idx, col] = audio_features[col]

        # index=False keeps the rewritten csv free of a spurious index column on re-read
        self.df.to_csv('music-analysis.csv', index=False)
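
A minimal end-to-end sketch tying the pieces together; the Spotify credentials and connection URL are placeholders, not part of this commit, and music-analysis.csv must exist in the working directory:

# Hypothetical wiring; credentials and the connection URL are assumptions.
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sqlalchemy import create_engine

from AI.muserdatabuilder import MuserDataBuilder

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET'))
engine = create_engine('mssql+pyodbc://user:pass@server/muser'
                       '?driver=ODBC+Driver+17+for+SQL+Server')

MuserDataBuilder(sp, engine).build_muser_data()  # enriches music-analysis.csv in place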