diff --git a/README.md b/README.md index 437d70a..d9de196 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,9 @@ PyMIR is a Python library for common tasks in Music Information Retrieval (MIR) * RMS * Spectrum (FFT) * Zero-crossing rate + * Linear Predictive Components (LPC) + * Linear Predictive Cepstral Components (LPCC) from LPC + * Line Spectrum Pairs (LSP) / Line Spectrum Frequencies (LSF) from LPC * Spectral feature extraction (Spectrum class) * Spectral Centroid * Spectral Flatness @@ -34,6 +37,7 @@ PyMIR is a Python library for common tasks in Music Information Retrieval (MIR) * Naive pitch estimation * Onset detectors (energy, flux) * Spectral Flux + * Delta computation of features (Useful for speech processing) ## Examples @@ -72,6 +76,9 @@ The standard workflow for working with PyMIR is: fixedFrames[0].plot() # Plot using matplotlib fixedFrames[0].rms() # Root-mean-squared amplitude fixedFrames[0].zcr() # Zero-crossing raate + fixedFrames[0].lpc() # LPC, with order = len(fixedFrames[0])-1 + fixedFrames[0].lpcc() # LPCC, with order = len(fixedFrames[0])-1 + fixedFrames[0].lsp() # LSP/LSF, with order = len(fixedFrames[0])-1 ### Extracting spectral features # Compute the spectra of each frame @@ -96,6 +103,12 @@ The standard workflow for working with PyMIR is: # Compute the spectral flux flux = SpectralFlux.spectralFlux(spectra, rectify = True) + from pymir.Deltas import getDeltas + # Computing delta and delta-deltas + deltas = getDeltas([1,2,3,4,5]) + print deltas # array([ 1. , 2. , 3. , 4. , 5. , 0.5 , 0.8 , 1. , 0.8 , + 0.5 , 0.13, 0.11, 0. , -0.11, -0.13]) + ### Audio playback Playback is provided on all AudioFile and Frame objects. Internal representation is 32-bit floating point. @@ -107,7 +120,7 @@ Playback is provided on all AudioFile and Frame objects. Internal representation Naive chord estimation using a dictionary of the 24 major and minor triads only, represented as normalized chroma vectors. Similarity is measured using the cosine similarity function. The closest -match is returned (as a string). +match is returned (as a string). This is called a naive approach because it does not consider preceding chords, which could improve chord estimation accuracy. diff --git a/pymir/Deltas.py b/pymir/Deltas.py new file mode 100644 index 0000000..fe3b2df --- /dev/null +++ b/pymir/Deltas.py @@ -0,0 +1,44 @@ +import numpy as np + + +def getDeltas(seq, derivative=2, winsize=2): + # First stack the static features + ret = seq[:] + for i in xrange(derivative): + seq = _getSingleDeltas(seq) + ret.extend(seq) + return ret + + +def _getSingleDeltas(feature, winsize=2): + ''' + Calculates a single pass deltas for the given feature + returns the calculated feature stacked upon the given feature + ''' + ret = [] + # Calculates the denominator: 2* \sum_n^N n*n + denom = 2. * sum(x**2 for x in xrange(1, winsize + 1)) + # iterate over all frames + for frameindex in xrange(len(feature)): + # We calculate the difference in between two frames + # In the border case of having the current frame is < winsize, we use the + # Current frame as the "replacement" effectively exting the array left and right by + # the frames at the positions +- winsize + fwd = bwd = feature[frameindex] + innersum = 0 + # Winsize will range between 1 and winsize+1, since we want to have the + # adjacent frames + for k in xrange(1, winsize + 1): + # Check if our features are in range, if not we use the default + # setting + # Since one of the features will certainly be not out of range ( + # except having + # a zero or one length frame length), we don't get any zeros in the + # result + if frameindex + k < len(feature): + fwd = feature[frameindex + k] + if frameindex - k >= 0: + bwd = feature[frameindex - k] + innersum += k * (fwd - bwd) + ret.append(innersum / denom) + return ret diff --git a/pymir/Frame.py b/pymir/Frame.py index 2addb68..0412b70 100644 --- a/pymir/Frame.py +++ b/pymir/Frame.py @@ -11,32 +11,32 @@ from numpy import * from numpy.lib import stride_tricks -import scipy import matplotlib.pyplot as plt import pymir -from pymir import Spectrum, Transforms +from pymir import Spectrum, Transforms, LinearPredictiveAnalysis import pyaudio + class Frame(numpy.ndarray): - + def __new__(subtype, shape, dtype=float, buffer=None, offset=0, - strides=None, order=None): + strides=None, order=None): # Create the ndarray instance of our type, given the usual # ndarray input arguments. This will call the standard # ndarray constructor, but return an object of our type. # It also triggers a call to InfoArray.__array_finalize__ obj = numpy.ndarray.__new__(subtype, shape, dtype, buffer, offset, strides, - order) - + order) + obj.sampleRate = 0 obj.channels = 1 obj.format = pyaudio.paFloat32 - + # Finally, we must return the newly created object: return obj - + def __array_finalize__(self, obj): # ``self`` is a new object resulting from # ndarray.__new__(InfoArray, ...), therefore it only has @@ -49,7 +49,8 @@ def __array_finalize__(self, obj): # (we're in the middle of the InfoArray.__new__ # constructor, and self.info will be set when we return to # InfoArray.__new__) - if obj is None: return + if obj is None: + return # From view casting - e.g arr.view(InfoArray): # obj is arr # (type(obj) can be InfoArray) @@ -61,30 +62,30 @@ def __array_finalize__(self, obj): # method sees all creation of default objects - with the # InfoArray.__new__ constructor, but also with # arr.view(InfoArray). - + self.sampleRate = getattr(obj, 'sampleRate', None) self.channels = getattr(obj, 'channels', None) self.format = getattr(obj, 'format', None) - + # We do not need to return anything - + ##################### # Frame methods ##################### - + def cqt(self): """ Compute the Constant Q Transform (CQT) """ return Transforms.cqt(self) - + def dct(self): """ Compute the Discrete Cosine Transform (DCT) """ return Transforms.dct(self) - - def energy(self, windowSize = 256): + + def energy(self, windowSize=256): """ Compute the energy of this frame """ @@ -92,17 +93,89 @@ def energy(self, windowSize = 256): window = numpy.hamming(windowSize) window.shape = (windowSize, 1) - - n = N - windowSize #number of windowed samples. - - # Create a view of signal who's shape is (n, windowSize). Use stride_tricks such that each stide jumps only one item. - p = numpy.power(self,2) - s = stride_tricks.as_strided(p,shape=(n, windowSize), strides=(self.itemsize, self.itemsize)) + + n = N - windowSize # number of windowed samples. + + # Create a view of signal who's shape is (n, windowSize). Use + # stride_tricks such that each stide jumps only one item. + p = numpy.power(self, 2) + s = stride_tricks.as_strided( + p, shape=(n, windowSize), strides=(self.itemsize, self.itemsize)) e = numpy.dot(s, window) / windowSize e.shape = (e.shape[0], ) return e - - def frames(self, frameSize, windowFunction = None): + + def lpcc(self, lpcorder=None, cepsorder=None): + ''' + Function: lpcc + Summary: Computes the linear predictive cepstral compoents. Note: Returned values are in the frequency domain. LPCC is computed through LPC. + Examples: audiofile = AudioFile.open('file.wav',16000) + frames = audiofile.frames(512,np.hamming) + for frame in frames: + frame.lpcc() + Attributes: + @param (self): + @param (lpcorder) default=None: The input order to compute the LPC coefficents. + @param (cepsorder) default=None: The output order to compute the LPCC coefficents. + Returns: A list of LPCC components with size order +1 or len(seq), depending on if cepsorder is None + ''' + coefs, err_term = LinearPredictiveAnalysis.lpc(self, lpcorder) + return LinearPredictiveAnalysis.lpcc(coefs, err_term, cepsorder) + + def lpc(self, order=None): + ''' + Function: lpc + Summary: Computes for each given sequence the LPC ( Linear predictive components ) sequence. + Examples: audiofile = AudioFile.open('file.wav',16000) + frames = audiofile.frames(512,np.hamming) + for frame in frames: + frame.lpc() + Attributes: + @param (seq):A sequence of time-domain frames, usually obtained by .frames() + @param (order) default=None: Size of the returning cepstral components. If None is given, + we use len(seq) as default, otherwise order +1 + Returns: A list of lpc coefficents + ''' + # Only return the coefficients not the error term (in [1]) + return LinearPredictiveAnalysis.lpc(self, order)[0] + + def lsp(self,order=None,rectify=True): + ''' + Function: lsp + Summary: Computes Line spectrum pairs ( also called line spectral frequencies [lsf]). Does not use any fancy algorithm except np.roots to solve + for the zeros of the given polynom A(z) = 0.5(P(z) + Q(z)) + Examples: audiofile = AudioFile.open('file.wav',16000) + frames = audiofile.frames(512,np.hamming) + for frame in frames: + frame.lsp() + Attributes: + @param (self): + @param (order) default=None:Order of lpc coefficients. Return array has size order + 1. Default is the length of the current frame + @param (rectify) default=True: Specifies if the return values are only positive. If rectify is False it also returns the (symmetric) negative values + Returns: A list of size order/ len(frames) (if nothing is specifed), which represents the line spectrum pairs. + ''' + coefs, _ = LinearPredictiveAnalysis.lpc(self, order) + return LinearPredictiveAnalysis.lsp(coefs,rectify) + + def autocorr(self, order=None): + ''' + Function: autocorr + Summary: Calculates the autocorrelation with the given order + Examples: f = AudioFile.open('audiofile.wav',16000) + for frame in f.frames(512,numpy.hamming): + frame.autocorr() + Attributes: + @param (self): + @param (order) default=None: The order ( order +1 is length of the returned array) of the auto correlation. + If order is None we use len(frame)-1 as default + Returns:Array of length order +1 with the autocorrelation coefficients + ''' + if order is None: + order = len(self) - 1 + return [sum(self[n] * self[n + tau] for n in xrange(len(self) - tau)) + for tau in xrange(order + 1)] + + def frames(self, frameSize, windowFunction=None): """ Decompose this frame into smaller frames of size frameSize """ @@ -110,13 +183,13 @@ def frames(self, frameSize, windowFunction = None): start = 0 end = frameSize while start < len(self): - + if windowFunction == None: frames.append(self[start:end]) else: window = windowFunction(frameSize) window.shape = (frameSize, 1) - window = numpy.squeeze(window) + window = numpy.squeeze(window) frame = self[start:end] if len(frame) < len(window): # Zero pad @@ -128,7 +201,7 @@ def frames(self, frameSize, windowFunction = None): diff = len(window) - len(frame) frame = numpy.append(frame, [0] * diff) - + if frameType == "AudioFile": frame = frame.view(pymir.AudioFile) else: @@ -138,22 +211,22 @@ def frames(self, frameSize, windowFunction = None): frame.sampleRate = sampleRate frame.channels = channels frame.format = format - + windowedFrame = frame * window frames.append(windowedFrame) start = start + frameSize end = end + frameSize - + return frames - + def framesFromOnsets(self, onsets): """ Decompose into frames based on onset start time-series """ frames = [] for i in range(0, len(onsets) - 1): - frames.append(self[onsets[i] : onsets[i + 1]]) + frames.append(self[onsets[i]: onsets[i + 1]]) return frames @@ -164,7 +237,8 @@ def play(self): """ # Create the stream p = pyaudio.PyAudio() - stream = p.open(format = self.format, channels = self.channels, rate = self.sampleRate, output = True) + stream = p.open( + format=self.format, channels=self.channels, rate=self.sampleRate, output=True) # Write the audio data to the stream audioData = self.tostring() @@ -191,11 +265,11 @@ def rms(self): sum = 0 for i in range(0, len(self)): sum = sum + self[i] ** 2 - + sum = sum / (1.0 * len(self)) - + return math.sqrt(sum) - + # Spectrum def spectrum(self): """ @@ -212,5 +286,5 @@ def zcr(self): for i in range(1, len(self)): if (self[i - 1] * self[i]) < 0: zcr = zcr + 1 - - return zcr / (1.0 * len(self)) \ No newline at end of file + + return zcr / (1.0 * len(self)) diff --git a/pymir/LinearPredictiveAnalysis.py b/pymir/LinearPredictiveAnalysis.py new file mode 100644 index 0000000..0e62e6b --- /dev/null +++ b/pymir/LinearPredictiveAnalysis.py @@ -0,0 +1,99 @@ +import numpy as np +import pymir +from scipy.linalg import toeplitz +import itertools + +def lpc(seq, order=None): + ''' + Function: lpc + Summary: Computes for each given sequence the LPC ( Linear predictive components ) sequence. + Examples: + Attributes: + @param (seq):A sequence of time-domain frames, usually obtained by .frames() + @param (order) default=None: Size of the returning cepstral components. If None is given, + we use len(seq) as default, otherwise order +1 + Returns: A tuple, which elements are (lpc coefficents,error_term). The error term is the sqare root of the squared prediction error. + ''' + # In this lpc method we use the slow( if the order is >50) autocorrelation approach. + acseq = np.array(pymir.Frame.autocorr(seq, order)) + # Using pseudoinverse to obtain a stable estiamte of the toeplitz matrix + a_coef = np.dot(np.linalg.pinv(toeplitz(acseq[:-1])), -acseq[1:].T) + # Squared prediction error, defined as e[n] = a[n] + \sum_k=1^order (a_k * + # s_{n-k}) + err_term = acseq[0] + sum(a * c for a, c in zip(acseq[1:], a_coef)) + return a_coef.tolist(), np.sqrt(err_term) + + +def lpcc(seq, err_term, order=None): + ''' + Function: lpcc + Summary: Computes the linear predictive cepstral compoents. Note: Returned values are in the frequency domain + Examples: audiofile = AudioFile.open('file.wav',16000) + frames = audiofile.frames(512,np.hamming) + for frame in frames: + frame.lpcc() + Note that we already preprocess in the Frame class the lpc conversion! + Attributes: + @param (seq):A sequence of lpc components. Need to be preprocessed by lpc() + @param (err_term):Error term for lpc sequence. Returned by lpc()[1] + @param (order) default=None: Return size of the array. Function returns order+1 length array. Default is len(seq) + Returns: List with lpcc components with default length len(seq), otherwise length order +1 + ''' + if order is None: + order = len(seq) - 1 + lpcc_coeffs = [np.log(err_term), -seq[0]] + for n in xrange(2, order + 1): + # Use order + 1 as upper bound for the last iteration + upbound = (order + 1 if n > order else n) + lpcc_coef = -sum(i * lpcc_coeffs[i] * seq[n - i - 1] + for i in xrange(1, upbound)) * 1. / upbound + lpcc_coef -= seq[n - 1] if n <= len(seq) else 0 + lpcc_coeffs.append(lpcc_coef) + return lpcc_coeffs + +def lsp(lpcseq,rectify=True): + ''' + Function: lsp + Summary: Computes Line spectrum pairs ( also called line spectral frequencies [lsf]). Does not use any fancy algorithm except np.roots to solve + for the zeros of the given polynom A(z) = 0.5(P(z) + Q(z)) + Examples: audiofile = AudioFile.open('file.wav',16000) + frames = audiofile.frames(512,np.hamming) + for frame in frames: + frame.lpcc() + Attributes: + @param (lpcseq):The sequence of lpc coefficients as \sum_k=1^{p} a_k z^{-k} + @param (rectify) default=True: If true returns only the values >= 0, since the result is symmetric. If all values are wished, specify rectify = False + Returns: A list with same length as lpcseq (if rectify = True), otherwise 2*len(lpcseq), which represents the line spectrum pairs + ''' + # We obtain 1 - A(z) +/- z^-(p+1) * (1 - A(z)) + # After multiplying everything through it becomes + # 1 - \sum_k=1^p a_k z^{-k} +/- z^-(p+1) - \sum_k=1^p a_k z^{k-(p+1)} + # Thus we have on the LHS the usual lpc polynomial and on the RHS we need to reverse the coefficient order + # We assume further that the lpc polynomial is a valid one ( first coefficient is 1! ) + + # the rhs does not have a constant expression and we reverse the coefficients + rhs = [0] + lpcseq[::-1] + [1] + # The P polynomial + P = [] + # The Q polynomial + Q = [] + # Assuming constant coefficient is 1, which is required. Moreover z^{-p+1} does not exist on the lhs, thus appending 0 + lpcseq = [1] + lpcseq[:] + [0] + for l,r in itertools.izip_longest(lpcseq,rhs): + P.append(l + r) + Q.append(l - r) + # Find the roots of the polynomials P,Q ( numpy assumes we have the form of: p[0] * x**n + p[1] * x**(n-1) + ... + p[n-1]*x + p[n] + # mso we need to reverse the order) + p_roots = np.roots(P[::-1]) + q_roots = np.roots(Q[::-1]) + # Keep the roots in order + lsf_p = sorted(np.angle(p_roots)) + lsf_q = sorted(np.angle(q_roots)) + # print sorted(lsf_p+lsf_q),len([i for i in lsf_p+lsf_q if i > 0.]) + if rectify: + # We only return the positive elements, and also remove the final Pi (3.14) value at the end, + # since it always occurs + return sorted(i for i in lsf_q + lsf_p if (i > 0))[:-1] + else: + # Remove the -Pi and +pi at the beginning and end in the list + return sorted(i for i in lsf_q + lsf_p)[1:-1]