diff --git a/CHANGES.rst b/CHANGES.rst index f75556259..8fd035742 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,10 @@ Release Notes Version 0.17.dev0 ----------------- +New features: + +* New `cepstrogram` module, includes `MFCC` data class and processors (#269) + Bug fixes: * `BufferProcessor` can handle data longer than buffer length (#398) diff --git a/README.rst b/README.rst index 11ce5d690..89115c3cd 100644 --- a/README.rst +++ b/README.rst @@ -366,8 +366,16 @@ References Acknowledgements ================ -Supported by the European Commission through the `GiantSteps project -`_ (FP7 grant agreement no. 610591) and the -`Phenicx project `_ (FP7 grant agreement no. 601166) -as well as the `Austrian Science Fund (FWF) `_ project -Z159. +Developed at the `Johannes Kepler University Linz +`_, the +`Austrian Research Institute for Artificial Intelligence (OFAI) +`_, and the `TU Wien `_. + +Supported by the European Commission through the projects `GiantSteps +`_ (FP7 grant agreement no. 610591) and +`Phenicx `_ (FP7 grant agreement no. 601166), the +`Austrian Science Fund (FWF) `_ project +Z159, as well as the `Austrian Research Promotion Agency (FFG) +`_ through the +`SmarterJam `_ +(BRIDGE 1 grant no. 858514) project. diff --git a/docs/index.rst b/docs/index.rst index 0533d7c77..5a7408b41 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -37,8 +37,16 @@ Indices and tables Acknowledgements ---------------- -Supported by the European Commission through the `GiantSteps project -`_ (FP7 grant agreement no. 610591) and the -`Phenicx project `_ (FP7 grant agreement no. 601166) -as well as the `Austrian Science Fund (FWF) `_ project -Z159. +Developed at the `Johannes Kepler University Linz +`_, the +`Austrian Research Institute for Artificial Intelligence (OFAI) +`_, and the `TU Wien `_. + +Supported by the European Commission through the projects `GiantSteps +`_ (FP7 grant agreement no. 610591) and +`Phenicx `_ (FP7 grant agreement no. 
601166), the +`Austrian Science Fund (FWF) `_ project +Z159, as well as the `Austrian Research Promotion Agency (FFG) +`_ through the +`SmarterJam `_ +(BRIDGE 1 grant no. 858514) project. diff --git a/madmom/audio/__init__.py b/madmom/audio/__init__.py index 69b37d604..c2cc829b9 100644 --- a/madmom/audio/__init__.py +++ b/madmom/audio/__init__.py @@ -24,8 +24,9 @@ class or inherit from madmom.SequentialProcessor or ParallelProcessor. from __future__ import absolute_import, division, print_function # import the submodules -from . import comb_filters, filters, signal, spectrogram, stft +from . import cepstrogram, comb_filters, filters, signal, spectrogram, stft # import classes used often +from .cepstrogram import MFCC, MFCCProcessor from .chroma import DeepChromaProcessor from .signal import (FramedSignal, FramedSignalProcessor, Signal, SignalProcessor, ) diff --git a/madmom/audio/cepstrogram.py b/madmom/audio/cepstrogram.py index 0978a0899..979ab8758 100644 --- a/madmom/audio/cepstrogram.py +++ b/madmom/audio/cepstrogram.py @@ -9,12 +9,15 @@ from __future__ import absolute_import, division, print_function +import inspect + import numpy as np from scipy.fftpack import dct -from ..processors import Processor from .filters import MelFilterbank from .spectrogram import Spectrogram +from ..processors import Processor +from ..utils import lazyprop class Cepstrogram(np.ndarray): @@ -53,8 +56,6 @@ def __new__(cls, spectrogram, transform=dct, **kwargs): obj = np.asarray(data).view(cls) # save additional attributes obj.spectrogram = spectrogram - # TODO: what are the frequencies of the bins? - # obj.bin_frequencies = ??? 
obj.transform = transform # return the object return obj @@ -64,7 +65,6 @@ def __array_finalize__(self, obj): return # set default values here, also needed for views self.spectrogram = getattr(obj, 'spectrogram', None) - self.bin_frequencies = getattr(obj, 'bin_frequencies', None) self.transform = getattr(obj, 'transform', None) @property @@ -93,7 +93,7 @@ def __init__(self, transform=dct, **kwargs): # pylint: disable=unused-argument self.transform = transform - def process(self, data): + def process(self, data, **kwargs): """ Return a Cepstrogram of the given data. @@ -101,6 +101,8 @@ def process(self, data): ---------- data : numpy array Data to be processed (usually a spectrogram). + kwargs : dict + Keyword arguments passed to :class:`Cepstrogram`. Returns ------- @@ -108,7 +110,11 @@ def process(self, data): Cepstrogram. """ - return Cepstrogram(data, transform=self.transform) + # update arguments passed to Cepstrogram + args = dict(transform=self.transform) + args.update(kwargs) + # instantiate and return Cepstrogram + return Cepstrogram(data, **args) MFCC_BANDS = 30 @@ -116,7 +122,10 @@ def process(self, data): MFCC_FMAX = 15000. MFCC_NORM_FILTERS = True MFCC_MUL = 1. -MFCC_ADD = 0. +MFCC_ADD = np.spacing(1) +MFCC_DCT_NORM = 'ortho' +MFCC_DELTA_FILTER = np.linspace(4, -4, 9) / 60 +MFCC_DELTA_DELTA_FILTER = np.linspace(1, -1, 3) / 2 class MFCC(Cepstrogram): @@ -127,8 +136,6 @@ class MFCC(Cepstrogram): ---------- spectrogram : :class:`.audio.spectrogram.Spectrogram` instance Spectrogram. - transform : numpy ufunc, optional - Transformation applied to the `spectrogram`. filterbank : :class:`.audio.filters.Filterbank` type or instance, optional Filterbank used to filter the `spectrogram`; if a :class:`.audio.filters.Filterbank` type (i.e. class) is given @@ -148,7 +155,9 @@ class MFCC(Cepstrogram): logarithm. add : float, optional Add this value before taking the logarithm of the magnitudes. 
- kwargs : dict + dct_norm : {'ortho', None}, optional + Normalization mode (see scipy.fftpack.dct). Default is 'ortho'. + kwargs : dict, optional If no :class:`.audio.spectrogram.Spectrogram` instance was given, one is instantiated and these keyword arguments are passed. @@ -176,17 +185,17 @@ class MFCC(Cepstrogram): # pylint: disable=super-init-not-called # pylint: disable=attribute-defined-outside-init - def __init__(self, spectrogram, transform=dct, filterbank=MelFilterbank, + def __init__(self, spectrogram, filterbank=MelFilterbank, num_bands=MFCC_BANDS, fmin=MFCC_FMIN, fmax=MFCC_FMAX, norm_filters=MFCC_NORM_FILTERS, mul=MFCC_MUL, add=MFCC_ADD, - **kwargs): + dct_norm=MFCC_DCT_NORM, **kwargs): # this method is for documentation purposes only pass - def __new__(cls, spectrogram, transform=dct, filterbank=MelFilterbank, + def __new__(cls, spectrogram, filterbank=MelFilterbank, num_bands=MFCC_BANDS, fmin=MFCC_FMIN, fmax=MFCC_FMAX, norm_filters=MFCC_NORM_FILTERS, mul=MFCC_MUL, add=MFCC_ADD, - **kwargs): + dct_norm=MFCC_DCT_NORM, **kwargs): # for signature documentation see __init__() from .filters import Filterbank # instantiate a Spectrogram if needed @@ -194,17 +203,8 @@ def __new__(cls, spectrogram, transform=dct, filterbank=MelFilterbank, # try to instantiate a Spectrogram object spectrogram = Spectrogram(spectrogram, **kwargs) - # recalculate the spec if it is filtered or scaled already - if (spectrogram.filterbank is not None or - spectrogram.mul is not None or - spectrogram.add is not None): - import warnings - warnings.warn('Spectrogram was filtered or scaled already, redo ' - 'calculation!') - spectrogram = Spectrogram(spectrogram.stft) - # instantiate a Filterbank if needed - if issubclass(filterbank, Filterbank): + if inspect.isclass(filterbank) and issubclass(filterbank, Filterbank): # create a filterbank of the given type filterbank = filterbank(spectrogram.bin_frequencies, num_bands=num_bands, fmin=fmin, fmax=fmax, @@ -216,13 +216,12 @@ def 
__new__(cls, spectrogram, transform=dct, filterbank=MelFilterbank, # filter the spectrogram data = np.dot(spectrogram, filterbank) # logarithmically scale the magnitudes - np.log10(mul * data + add, out=data) - # apply the transformation - data = transform(data) + np.log(mul * data + add, out=data) + # apply type 2 DCT + data = dct(data, norm=dct_norm) # cast as MFCC obj = np.asarray(data).view(cls) # save additional attributes - obj.transform = transform obj.spectrogram = spectrogram obj.filterbank = filterbank obj.mul = mul @@ -230,6 +229,126 @@ def __new__(cls, spectrogram, transform=dct, filterbank=MelFilterbank, # return the object return obj + @staticmethod + def calc_deltas(data, delta_filter): + """ + Apply the given filter to the data after automatically padding by + replicating the first and last frame. The length of the padding is + calculated via ceil(len(delta_filter) / 2). + + Applying a filter means passing the matrix column after column to + ``np.convolve()``. Afterwards the array is truncated to the same + shape as the input array. + + Parameters + ---------- + data: numpy array + Data to process, i.e. MFCCs or deltas thereof. + delta_filter: numpy array + Filter used for convolution. + + Returns + ------- + deltas: numpy array + Deltas of `data`, same shape as `data`. + + """ + # pad data by replicating the first and the last frame + k = int(np.ceil(len(delta_filter) / 2)) + padded = np.vstack((np.array([data[0], ] * k), data, + np.array([data[-1], ] * k))) + # calculate the deltas for each coefficient + deltas = [] + for band in padded.T: + deltas.append(np.convolve(band, delta_filter, 'same')) + # return deltas (first/last k frames truncated) + return np.vstack(deltas).T[k:-k] + + @lazyprop + def deltas(self, delta_filter=MFCC_DELTA_FILTER): + """ + First order derivative of the MFCCs. + + Parameters + ---------- + delta_filter: numpy array, optional + Filter to calculate the derivative of the MFCCs. 
+ + Returns + ------- + deltas: numpy array + Deltas of the MFCCs, same shape as MFCCs. + + Notes + ----- + Accessing this property corresponds to the function call + ``MFCC.calc_deltas(mfccs, delta_filter)``, with results being cached. + + """ + return MFCC.calc_deltas(self, delta_filter) + + @lazyprop + def delta_deltas(self, delta_delta_filter=MFCC_DELTA_DELTA_FILTER): + """ + Second order derivatives of the MFCCs. + + Parameters + ---------- + delta_delta_filter: numpy array, optional + Filter to calculate the derivative of the derivative. + + Returns + ------- + deltas: numpy array + Delta deltas of the MFCCs, same shape as MFCCs. + + Notes + ----- + Accessing this property corresponds to the function call + ``MFCC.calc_deltas(deltas, delta_delta_filter)``, with results being + cached. + + """ + return MFCC.calc_deltas(self.deltas, delta_delta_filter) + + def calc_voicebox_deltas(self, delta_filter=MFCC_DELTA_FILTER, + delta_delta_filter=MFCC_DELTA_DELTA_FILTER): + """ + Calculates deltas and delta deltas the way it is done in the voicebox + MatLab toolbox [1]_. + + Parameters + ---------- + delta_filter : numpy array + Filter to calculate the derivative of the MFCCs. + delta_delta_filter : numpy array + Filter to calculate the derivative of the derivative. + + Returns + ------- + [mfcc, delta, delta_delta] : numpy array, shape (num_frames, bands * 3) + Horizontally stacked array consisting of the MFCC coefficients, + their first and second order derivatives. + + References + ---------- + .. 
[1] http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html + + """ + padded_input = np.vstack( + (np.array([self[0], ] * 5), self, np.array([self[-1], ] * 5))) + deltashape = tuple(reversed(padded_input.shape)) + flat_input = padded_input.transpose().flatten() + deltas = np.convolve(flat_input, delta_filter, mode='same') + deltas = deltas.reshape(deltashape).T[4:-4, ] + deltadeltashape = tuple(reversed(deltas.shape)) + flat_deltas = deltas.transpose().flatten() + deltas = deltas[1:-1, ] + delta_deltas = np.convolve(flat_deltas, delta_delta_filter, + mode='same') + delta_deltas = delta_deltas.reshape(deltadeltashape).T[1:-1, ] + return np.hstack((self, deltas, delta_deltas)) + def __array_finalize__(self, obj): if obj is None: return @@ -242,9 +361,8 @@ def __array_finalize__(self, obj): class MFCCProcessor(Processor): """ - MFCCProcessor is CepstrogramProcessor which filters the magnitude - spectrogram of the spectrogram with a Mel filterbank, takes the logarithm - and performs a discrete cosine transform afterwards. + MFCCProcessor filters the magnitude spectrogram with a Mel filterbank, + takes the logarithm and performs a discrete cosine transform afterwards. Parameters ---------- @@ -261,14 +379,12 @@ class MFCCProcessor(Processor): logarithm. add : float, optional Add this value before taking the logarithm of the magnitudes. - transform : numpy ufunc - Transformation applied to the Mel filtered spectrogram. """ def __init__(self, num_bands=MFCC_BANDS, fmin=MFCC_FMIN, fmax=MFCC_FMAX, norm_filters=MFCC_NORM_FILTERS, mul=MFCC_MUL, add=MFCC_ADD, - transform=dct, **kwargs): + **kwargs): # pylint: disable=unused-argument self.num_bands = num_bands self.fmin = fmin @@ -276,16 +392,19 @@ def __init__(self, num_bands=MFCC_BANDS, fmin=MFCC_FMIN, fmax=MFCC_FMAX, self.norm_filters = norm_filters self.mul = mul self.add = add - self.transform = transform + # TODO: add filterbank argument to the processor? 
+ self.filterbank = None # needed for caching - def process(self, data): + def process(self, data, **kwargs): """ Process the data and return the MFCCs of it. Parameters ---------- data : numpy array - Data to be processed (usually a spectrogram). + Data to be processed (a spectrogram). + kwargs : dict, optional + Keyword arguments passed to :class:`MFCC`. Returns ------- @@ -293,6 +412,15 @@ def process(self, data): MFCCs of the data. """ - return MFCC(data, num_bands=self.num_bands, fmin=self.fmin, - fmax=self.fmax, norm_filters=self.norm_filters, - mul=self.mul, add=self.add) + # update arguments passed to MFCCs + # TODO: if these arguments change, the filterbank needs to be discarded + args = dict(num_bands=self.num_bands, fmin=self.fmin, fmax=self.fmax, + norm_filters=self.norm_filters, mul=self.mul, add=self.add, + filterbank=self.filterbank) + args.update(kwargs) + # instantiate MFCCs + data = MFCC(data, **args) + # cache the filterbank + self.filterbank = data.filterbank + # return MFCCs + return data diff --git a/madmom/utils/__init__.py b/madmom/utils/__init__.py index 46c0893e5..74309b70e 100644 --- a/madmom/utils/__init__.py +++ b/madmom/utils/__init__.py @@ -661,5 +661,34 @@ def segment_axis(signal, frame_size, hop_size, axis=None, end='cut', dtype=signal.dtype) +# taken from: https://stackoverflow.com/questions/3012421/ +def lazyprop(fn): + """ + A decorator for a caching, lazily evaluated property. If a function is + decorated with @lazyprop, the original function of the resulting property + is only called on the first access. Afterwards the result which was + produced then is returned again. 
+ + Parameters + ---------- + fn: Function + A function without argument which returns the value of the property + + Returns + ------- + property + A property which wraps the original one and caches its first result + """ + attr_name = '_lazy_' + fn.__name__ + + @property + def _lazyprop(self): + if not hasattr(self, attr_name): + setattr(self, attr_name, fn(self)) + return getattr(self, attr_name) + + return _lazyprop + + # keep namespace clean del contextlib diff --git a/tests/test_audio_cepstrogram.py b/tests/test_audio_cepstrogram.py new file mode 100644 index 000000000..ed7569896 --- /dev/null +++ b/tests/test_audio_cepstrogram.py @@ -0,0 +1,72 @@ +# encoding: utf-8 +# pylint: skip-file +""" +This file contains tests for the madmom.audio.cepstrogram module. + +""" + +from __future__ import absolute_import, division, print_function + +import unittest +from functools import partial +from os.path import join as pj + +from madmom.audio.cepstrogram import MFCC, Cepstrogram +from madmom.audio.filters import MelFilterbank +from madmom.audio.spectrogram import * +from . 
import AUDIO_PATH + +sample_file = pj(AUDIO_PATH, 'sample.wav') + + +class TestMFCCClass(unittest.TestCase): + + def setUp(self): + self.mfcc = MFCC(sample_file) + + def test_types(self): + self.assertIsInstance(self.mfcc, MFCC) + self.assertIsInstance(self.mfcc, Cepstrogram) + # attributes + self.assertIsInstance(self.mfcc.filterbank, MelFilterbank) + # properties + self.assertIsInstance(self.mfcc.deltas, np.ndarray) + self.assertIsInstance(self.mfcc.delta_deltas, np.ndarray) + self.assertIsInstance(self.mfcc.num_bins, int) + self.assertIsInstance(self.mfcc.num_frames, int) + # wrong filterbank type + with self.assertRaises(TypeError): + FilteredSpectrogram(sample_file, filterbank='bla') + + def test_values(self): + allclose = partial(np.allclose, rtol=1.e-3, atol=1.e-5) + # values + self.assertTrue(allclose(self.mfcc[0, :6], + [-3.61102366, 6.81075716, 2.55457568, + 1.88377929, 1.04133379, 0.6382336])) + self.assertTrue(allclose(self.mfcc[0, -6:], + [-0.20386486, -0.18468723, -0.00233107, + 0.20703268, 0.21419463, 0.00598407])) + # attributes + self.assertTrue(self.mfcc.shape == (281, 30)) + # properties + self.assertEqual(self.mfcc.num_bins, 30) + self.assertEqual(self.mfcc.num_frames, 281) + + def test_deltas(self): + allclose = partial(np.allclose, rtol=1.e-2, atol=1.e-4) + # don't compare first element because it is dependent on the + # padding used for filtering + self.assertTrue(allclose(self.mfcc.deltas[1, :6], + [-0.02286286, -0.11329014, 0.05381977, + 0.10438456, 0.04268386, -0.06839912])) + self.assertTrue(allclose(self.mfcc.deltas[1, -6:], + [-0.03156065, -0.019716, -0.03417692, + -0.07768068, -0.05539324, -0.02616282])) + # delta deltas + self.assertTrue(allclose(self.mfcc.delta_deltas[1, :6], + [-0.00804922, -0.009922, -0.00454391, + 0.0038989, 0.00254525, 0.0120557])) + self.assertTrue(allclose(self.mfcc.delta_deltas[1, -6:], + [0.0072148, 0.00094424, 0.00029913, + 0.00530994, 0.00184207, -0.00276511]))