Skip to content

Music audio sample classification using the TensorFlow Keras machine learning library.

Notifications You must be signed in to change notification settings

pl728/audio-sample-classification

Repository files navigation

In [1]:

import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout

In [2]:

# Parse the kick and crash samples into one feature vector per audio file.
# Class ids: 0 = kick, 1 = crash

SAMPLE_RATE = 44100  # every file is resampled to this rate on load


def load_mel_features(path_prefix, count, digits, class_id):
    """Load a numbered run of .wav files and summarise each as a mel vector.

    Files are expected at ``path_prefix + <index zero-padded to `digits`> + ".wav"``
    for index 1..count (inclusive).

    Parameters
    ----------
    path_prefix : str
        Directory and file-name stem that precede the running number.
    count : int
        Number of files to read; indices run from 1 to ``count``.
    digits : int
        Width the running number is zero-padded to in the file name.
    class_id : int
        Integer label attached to every file of this set.

    Returns
    -------
    (list, list)
        The per-file feature vectors and a parallel list of ``class_id`` labels.
    """
    features, labels = [], []
    for index in range(1, count + 1):
        path = path_prefix + str(index).zfill(digits) + ".wav"
        samples, rate = librosa.load(path, sr=SAMPLE_RATE, res_type='kaiser_fast')
        # Average the mel spectrogram over its frames so each file collapses
        # to a single fixed-length vector regardless of clip duration.
        mels = np.mean(librosa.feature.melspectrogram(y=samples, sr=rate).T, axis=0)
        features.append(mels)
        labels.append(class_id)
    return features, labels


feature = []
label = []

# (path prefix, number of files, zero-pad width, class id)
sample_sets = (
    ("sample_kick/VEH1 Hard Kick - ", 201, 3, 0),
    ("sample_crash/VEH1 Crash - ", 50, 2, 1),
)
for path_prefix, count, digits, class_id in sample_sets:
    set_features, set_labels = load_mel_features(path_prefix, count, digits, class_id)
    feature.extend(set_features)
    label.extend(set_labels)

print(feature[0].shape)

data = {
    "X": np.array(feature),
    "t": np.array(label)
}

# One-hot encode the labels. num_classes is given explicitly so the encoding
# always has two columns, matching the two-unit softmax output of the model.
data["t"] = tf.keras.utils.to_categorical(data["t"], num_classes=2)
print(data["X"])

(128,)
[[4.9364899e+02 1.3382242e+03 8.3680701e+02 ... 8.0354203e-06
  8.5975907e-06 6.0973957e-06]
 [2.1349692e+02 9.8276776e+02 5.0937537e+02 ... 1.7508426e-03
  2.7589572e-03 8.0489012e-04]
 [3.4847284e+02 9.0860150e+02 3.3291241e+02 ... 4.1663774e-05
  7.3597308e-05 2.4802917e-05]
 ...
 [2.2502916e-05 8.3084352e-04 2.2441533e-03 ... 4.6322175e-08
  4.7188419e-08 5.0364477e-08]
 [2.0987345e-02 6.6036671e-02 1.5265882e-01 ... 2.5161073e-06
  2.5729428e-06 2.3973475e-06]
 [7.0509864e-03 2.2733379e-02 3.5233915e-02 ... 7.3609786e-04
  7.8171847e-04 1.1218864e-03]]

In [3]:

# Split the features and one-hot labels into train and test sets.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(data["X"], data["t"], random_state=1)
print(X_train.shape)

# Reshape every 128-value feature vector into a 16 x 8 grid with one channel,
# the layout the Conv2D layers expect. -1 lets NumPy infer the number of
# samples, so the cell keeps working if the dataset size or split changes.
X_train = X_train.reshape(-1, 16, 8, 1)
X_test = X_test.reshape(-1, 16, 8, 1)
print(X_train.shape)

(188, 128)
(188, 16, 8, 1)

In [4]:

# Shape of a single model input: the 128-value feature vector laid out as a
# 16 x 8 grid with one channel (must match the reshape applied to X_train / X_test).
input_dim = (16, 8, 1)

In [5]:

# Small CNN: two conv + max-pool stages, light dropout, then a dense head
# ending in a two-way softmax (one unit per class).
model = Sequential([
    Conv2D(filters=64, kernel_size=(3, 3), padding="same", activation="tanh", input_shape=input_dim),
    MaxPool2D(pool_size=(2, 2)),
    Conv2D(filters=128, kernel_size=(3, 3), padding="same", activation="tanh"),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.1),
    Flatten(),
    Dense(1024, activation="tanh"),
    Dense(2, activation="softmax"),
])

# Categorical cross-entropy pairs with the one-hot encoded labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train one sample at a time for three epochs; the test split is passed as
# validation data so its loss/accuracy are reported after every epoch.
model.fit(
    X_train,
    Y_train,
    epochs=3,
    batch_size=1,
    validation_data=(X_test, Y_test),
)
model.summary()

Train on 188 samples, validate on 63 samples
Epoch 1/3
188/188 [==============================] - 4s 20ms/sample - loss: 0.3363 - accuracy: 0.9787 - val_loss: 3.6908e-04 - val_accuracy: 1.0000
Epoch 2/3
188/188 [==============================] - 3s 17ms/sample - loss: 6.0605e-05 - accuracy: 1.0000 - val_loss: 3.2526e-04 - val_accuracy: 1.0000
Epoch 3/3
188/188 [==============================] - 3s 18ms/sample - loss: 6.7195e-05 - accuracy: 1.0000 - val_loss: 2.0428e-04 - val_accuracy: 1.0000
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 16, 8, 64)         640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 8, 4, 64)          0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 8, 4, 128)         73856     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 4, 2, 128)         0         
_________________________________________________________________
dropout (Dropout)            (None, 4, 2, 128)         0         
_________________________________________________________________
flatten (Flatten)            (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 1024)              1049600   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 2050      
=================================================================
Total params: 1,126,146
Trainable params: 1,126,146
Non-trainable params: 0
_________________________________________________________________

In [6]:

# Score the trained model on the test split. With the single 'accuracy'
# metric configured at compile time, the result is [loss, accuracy].
score = model.evaluate(x=X_test, y=Y_test)
print(score)

63/63 [==============================] - 0s 5ms/sample - loss: 2.0428e-04 - accuracy: 1.0000
[0.0002042830337069561, 1.0]

In [7]:

# Persist the trained classifier under the relative path "kick-crash-classifier".
# NOTE(review): the path has no file extension, so the on-disk format depends on
# the installed TensorFlow/Keras version — confirm it matches how the model is loaded.
model.save("kick-crash-classifier")

About

Music audio sample classification using the TensorFlow Keras machine learning library.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published