-
Notifications
You must be signed in to change notification settings - Fork 75
MFCC and Mel Spectrogram
Tim Sharii edited this page Nov 19, 2021
·
8 revisions
Scheme:
- Spectrum: power spectrum (default) or magnitude spectrum
- Mel Filterbank: Slaney (default) or HTK-like
- Nonlinearity: 10*Log10()
- DCT: 2N (default)
pyTorch:
# MFCC via torchaudio's MFCC transform. T is torchaudio.transforms (see the
# import in the mel-spectrogram snippet further down this page);
# sample_rate and waveform are defined outside this snippet.
n_fft = 2048
# NOTE(review): win_length is assigned but never passed to T.MFCC below.
win_length = None
hop_length = 512
n_mels = 40
n_mfcc = 13
# The mel-spectrogram settings are forwarded through melkwargs.
mfcc_transform = T.MFCC(
sample_rate=sample_rate,
n_mfcc=n_mfcc,
melkwargs={
'n_fft': n_fft,
'n_mels': n_mels,
'hop_length': hop_length
}
)
# Apply the transform to the waveform.
mfcc = mfcc_transform(waveform)
TODO
Scheme:
- Spectrum: power spectrum
- Mel Filterbank: HTK-like
- Nonlinearity: Ln()
- DCT: 2N
- and a lot of other parameters
# MFCC through torchaudio's Kaldi-compliance API; only the parameters listed
# in the call are overridden here.
m = torchaudio.compliance.kaldi.mfcc(waveform, frame_length=25, window_type='blackman', preemphasis_coefficient=0.97, remove_dc_offset=False)
Scheme:
- Spectrum: power spectrum (normalized)
- Mel Filterbank: essentially HTK-like (but slightly different)
- Nonlinearity: Ln()
- DCT: 2N
# s = s / 32768 # if samples are not in range [-1, 1]
# (or don't normalize signal in NWaves)
# NOTE(review): mfcc is presumably python_speech_features.mfcc -- the import
# is not shown on this page; confirm.
# Each argument has a counterpart in the NWaves MfccOptions that follow:
#   samplerate -> SamplingRate    winlen -> FrameDuration
#   winstep    -> HopDuration     numcep -> FeatureCount
#   nfilt      -> FilterBankSize  nfft   -> FftSize
#   preemph    -> PreEmphasis     ceplifter -> LifterSize
#   appendEnergy -> IncludeEnergy winfunc -> Window
mfcc(
signal,
samplerate=16000,
winlen=0.025,
winstep=0.01,
numcep=16,
nfilt=26,
nfft=512,
lowfreq=0,
highfreq=None,
preemph=0.97,
ceplifter=22,
appendEnergy=False,
winfunc=numpy.hanning
)
NWaves:
// NWaves options carrying the same values as the Python mfcc(...) call above
// (16 kHz, 25 ms frames, 10 ms hop, 16 coefficients, 26 mel filters, 512-point FFT).
var samplingRate = 16000;
var fftSize = 512;
// melCount is reused by the first-coefficient correction and by PsfFilterbank below.
var melCount = 26;
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 16,
FrameDuration = 0.025,
HopDuration = 0.01,
FilterBankSize = melCount,
// Matches the scheme listed above: normalized power spectrum, Ln(), DCT "2N".
SpectrumType = SpectrumType.PowerNormalized,
NonLinearity = NonLinearityType.LogE,
DctType = "2N",
FftSize = fftSize,
PreEmphasis = 0.97,
LifterSize = 22,
IncludeEnergy = false,
Window = WindowType.Hann,
};
- Correction for the first coefficient:
- if `appendEnergy=False` (`IncludeEnergy = false`):
// run this over mfccVectors once they have been computed:
// the same constant, ln(2) * sqrt(melCount), is subtracted from the first coefficient of every vector
var c0Shift = (float)(Math.Log(2) * Math.Sqrt(melCount));
foreach (var vector in mfccVectors)
{
    vector[0] -= c0Shift;
}
- if `appendEnergy=True` (`IncludeEnergy = true`):
// run this over mfccVectors once they have been computed:
// the same constant, ln(2), is subtracted from the first coefficient of every vector
var energyShift = (float)Math.Log(2);
foreach (var vector in mfccVectors)
{
    vector[0] -= energyShift;
}
- Set the filterbank explicitly: instead of `FilterBankSize = melCount`, use `FilterBank = PsfFilterbank(samplingRate, melCount, fftSize)`.
The code of the `PsfFilterbank` function:
/// <summary>
/// Builds a bank of triangular filters whose edges are equally spaced on the mel scale
/// and snapped (by flooring) to FFT bin indices.
/// NOTE(review): "Psf" presumably stands for python_speech_features, whose
/// get_filterbanks() this routine appears to mirror - confirm.
/// </summary>
/// <param name="samplingRate">Sampling rate of the signal, in Hz.</param>
/// <param name="filterbankSize">Number of filters to create.</param>
/// <param name="fftSize">FFT size; every filter gets fftSize / 2 + 1 weights.</param>
/// <param name="lowFreq">Lower edge of the first filter, in Hz.</param>
/// <param name="highFreq">
/// Upper edge of the last filter, in Hz. Any value not greater than
/// <paramref name="lowFreq"/> (including the default 0) selects samplingRate / 2.
/// </param>
/// <returns>Array of filterbankSize filters, each of length fftSize / 2 + 1.</returns>
float[][] PsfFilterbank(int samplingRate, int filterbankSize, int fftSize, double lowFreq = 0, double highFreq = 0)
{
    var filterbank = new float[filterbankSize][];

    if (highFreq <= lowFreq)
    {
        // Divide by 2.0: integer division would truncate Nyquist for odd sampling rates.
        highFreq = samplingRate / 2.0;
    }

    var low = NWaves.Utils.Scale.HerzToMel(lowFreq);
    var high = NWaves.Utils.Scale.HerzToMel(highFreq);

    // Factor converting a frequency in Hz to a (fractional) FFT bin position.
    var res = (fftSize + 1) / (float)samplingRate;

    // filterbankSize + 2 edge points spaced evenly in mel from `low` to `high`.
    // The `low +` offset is required whenever lowFreq != 0; without it the points
    // would start at 0 mel and never reach `high`.
    var bins = Enumerable
        .Range(0, filterbankSize + 2)
        .Select(i => (float)Math.Floor(res * NWaves.Utils.Scale.MelToHerz(low + i * (high - low) / (filterbankSize + 1))))
        .ToArray();

    for (var i = 0; i < filterbankSize; i++)
    {
        filterbank[i] = new float[fftSize / 2 + 1];

        // Rising slope: from the left edge up to (not including) the centre bin.
        for (var j = (int)bins[i]; j < (int)bins[i + 1]; j++)
        {
            filterbank[i][j] = (j - bins[i]) / (bins[i + 1] - bins[i]);
        }

        // Falling slope: from the centre bin down to (not including) the right edge.
        for (var j = (int)bins[i + 1]; j < (int)bins[i + 2]; j++)
        {
            filterbank[i][j] = (bins[i + 2] - j) / (bins[i + 2] - bins[i + 1]);
        }
    }

    return filterbank;
}
pyTorch:
import torchaudio.transforms as T
# Mel spectrogram via torchaudio's MelSpectrogram transform; sample_rate and
# waveform are defined outside this snippet.
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40
# power=2.0 requests a power (squared-magnitude) spectrogram.
mel_spectrogram = T.MelSpectrogram(
sample_rate=sample_rate,
n_fft=n_fft,
win_length=win_length,
hop_length=hop_length,
power=2.0,
n_mels=n_mels,
)
# Apply the transform to the waveform.
melspec = mel_spectrogram(waveform)
librosa:
# Mel spectrogram via librosa, using the same frame settings as the pyTorch
# snippet above; signal and sample_rate are defined outside this snippet.
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40
# The audio must be passed as y=...: in librosa >= 0.10 every argument of
# melspectrogram is keyword-only, so a positional signal raises TypeError.
# The keyword form also works on older librosa versions.
melspec_librosa = librosa.feature.melspectrogram(
    y=signal,
    sr=sample_rate,
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    center=False,
    pad_mode="reflect",
    power=2.0,
    n_mels=n_mels,
    norm=None,
    htk=True,
)
NWaves:
// NWaves counterpart of the Python mel-spectrogram snippets above:
// same FFT size (1024), hop (512) and number of mel bands (40).
// samplingRate is defined outside this snippet.
var fftSize = 1024;
var hopSize = 512;
var melCount = 40;
// NOTE(review): despite its name, this variable holds a FilterbankExtractor, not an MFCC extractor.
var mfccExtractor = new FilterbankExtractor(
new FilterbankOptions
{
SamplingRate = samplingRate,
// Frame length is set equal to the FFT size.
FrameSize = fftSize,
FftSize = fftSize,
HopSize = hopSize,
Window = WindowType.Hann,
// Filterbank is given explicitly: triangular filters over melCount mel bands.
FilterBank = FilterBanks.Triangular(fftSize, samplingRate, FilterBanks.MelBands(melCount, samplingRate)),
});