-
Notifications
You must be signed in to change notification settings - Fork 75
MFCC and Mel Spectrogram
Tim Sharii edited this page Nov 22, 2021
·
8 revisions
Scheme:
- Spectrum: power spectrum
- Mel Filterbank: HTK-like (default in PyTorch) or Slaney (default in librosa)
- Nonlinearity: 10*Log10()
- DCT: 2N (default)
PyTorch:
import torchaudio.transforms as T

# Framing / filterbank settings shared with the librosa and NWaves snippets.
n_fft = 1024
win_length = None  # None lets torchaudio use n_fft as the window length
hop_length = 512
n_mels = 24
n_mfcc = 13

# `melkwargs` is forwarded to the underlying mel-spectrogram transform.
# `win_length` is passed explicitly so that it mirrors the librosa call.
mfcc_transform = T.MFCC(
    sample_rate=sample_rate,
    n_mfcc=n_mfcc,
    melkwargs={
        'n_fft': n_fft,
        'win_length': win_length,
        'n_mels': n_mels,
        'hop_length': hop_length,
    },
)

# `waveform` is the torch tensor (the librosa snippet converts it with
# waveform.numpy()); a NumPy `signal` cannot be fed to a torch transform.
mfcc = mfcc_transform(waveform)
librosa:
# librosa needs two separate calls here, because the 'norm' parameter has
# two different meanings: in melspectrogram() it controls the mel filterbank
# normalization, in mfcc() it controls the DCT normalization.
melspec = librosa.feature.melspectrogram(
    y=waveform.numpy()[0], sr=sample_rate, n_fft=n_fft,
    win_length=win_length, hop_length=hop_length,
    n_mels=n_mels, htk=True, norm=None)

# power_to_db is used through its public top-level name rather than the
# internal librosa.core.spectrum module path.
mfcc_librosa = librosa.feature.mfcc(
    S=librosa.power_to_db(melspec),
    n_mfcc=n_mfcc, dct_type=2, norm='ortho')
NWaves:
// NWaves counterpart of the PyTorch / librosa settings shown above.
var fftSize = 1024;
var melCount = 24;
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 13,                          // n_mfcc
FrameSize = fftSize,                        // frame length equals the FFT size
HopSize = 512,                              // hop_length
FilterBankSize = melCount,                  // n_mels
SpectrumType = SpectrumType.Power,          // scheme: power spectrum
NonLinearity = NonLinearityType.ToDecibel,  // scheme: 10*Log10()
DctType = "2N",                             // scheme: DCT 2N
FftSize = fftSize,                          // n_fft
Window = WindowType.Hann,
LogFloor = 1e-10f
};
If your calculations are based on the librosa.feature.mfcc method,
called like this:
# The audio and the sample rate are passed by keyword (y=, sr=): recent
# librosa releases accept them only as keyword arguments, and the keyword
# form also works on older releases.
librosa.feature.mfcc(
    y=signal,
    sr=sample_rate,
    n_mfcc=13,
    dct_type=2,
    norm='ortho',
    window='hann',
    htk=True,
    n_mels=24,
    n_fft=1024,
    hop_length=512,
    center=False)
and you want full compliance with it, you can specify the following settings in NWaves:
// Builds the mel filterbank explicitly instead of passing only FilterBankSize.
var fftSize = 1024;
var melCount = 24;
var freqs = FilterBanks.MelBands(melCount, samplingRate);
var fbank = FilterBanks.Triangular(fftSize, samplingRate, freqs);
// NOTE(review): presumably reproduces the filter normalization that
// librosa.feature.mfcc applies by default - confirm against NWaves docs.
FilterBanks.Normalize(melCount, freqs, fbank);
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 13,                          // n_mfcc
FrameSize = fftSize,
HopSize = 512,                              // hop_length
FilterBank = fbank,                         // explicit filterbank built above
SpectrumType = SpectrumType.Power,
NonLinearity = NonLinearityType.ToDecibel,
DctType = "2N",                             // dct_type=2
FftSize = fftSize,                          // n_fft
Window = WindowType.Hann,                   // window='hann'
LogFloor = 1e-10f
};
librosa (htk=False, i.e. Slaney mel-filterbank):
# The audio and the sample rate are passed by keyword (y=, sr=): recent
# librosa releases accept them only as keyword arguments, and the keyword
# form also works on older releases.
librosa.feature.mfcc(
    y=signal,
    sr=sample_rate,
    n_mfcc=13,
    dct_type=2,
    norm='ortho',
    window='hann',
    htk=False,  # Slaney mel filterbank
    n_mels=40,
    n_fft=1024,
    hop_length=512,
    center=False)
NWaves:
// NWaves counterpart of the librosa call with htk=False shown above.
var fftSize = 1024;
var melCount = 40;
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 13,                          // n_mfcc
FrameSize = fftSize,
HopSize = 512,                              // hop_length
// Slaney-style mel bank, passed explicitly instead of FilterBankSize
FilterBank = FilterBanks.MelBankSlaney(melCount, fftSize, samplingRate),
SpectrumType = SpectrumType.Power,
NonLinearity = NonLinearityType.ToDecibel,
DctType = "2N",                             // dct_type=2
FftSize = fftSize,                          // n_fft
Window = WindowType.Hann,                   // window='hann'
LogFloor = 1e-10f
};
Scheme:
- Spectrum: power spectrum
- Mel Filterbank: HTK-like
- Nonlinearity: Ln()
- DCT: 2N
- and a lot of other parameters
# MFCC computed by torchaudio's Kaldi compliance module; only the arguments
# listed here are overridden, every other parameter keeps its library default.
m = torchaudio.compliance.kaldi.mfcc(waveform, frame_length=25, window_type='blackman', preemphasis_coefficient=0.97, remove_dc_offset=False)
NWaves:
// TODO
Scheme:
- Spectrum: power spectrum (normalized)
- Mel Filterbank: essentially HTK-like (but slightly different)
- Nonlinearity: Ln()
- DCT: 2N
# If the samples are not already in the range [-1, 1], scale them first:
#     s = s / 32768
# (or, alternatively, don't normalize the signal on the NWaves side).
# NOTE(review): `mfcc` is presumably python_speech_features.mfcc (the
# filterbank helper further down names that library) - confirm.
mfcc(
signal,
samplerate=16000,
winlen=0.025,
winstep=0.01,
numcep=16,
nfilt=26,
nfft=512,
lowfreq=0,
highfreq=None,
preemph=0.97,
ceplifter=22,
appendEnergy=False,
winfunc=numpy.hanning
)
NWaves:
// NWaves counterpart of the mfcc(...) call shown above.
var samplingRate = 16000;                   // samplerate
var fftSize = 512;                          // nfft
var melCount = 26;                          // nfilt
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 16,                          // numcep
FrameDuration = 0.025,                      // winlen
HopDuration = 0.01,                         // winstep
FilterBankSize = melCount,
SpectrumType = SpectrumType.PowerNormalized, // scheme: power spectrum (normalized)
NonLinearity = NonLinearityType.LogE,       // scheme: Ln()
DctType = "2N",
FftSize = fftSize,
PreEmphasis = 0.97,                         // preemph
LifterSize = 22,                            // ceplifter
IncludeEnergy = false,                      // appendEnergy
Window = WindowType.Hann,                   // winfunc=numpy.hanning
};
- Correction for the first coefficient:
  - if appendEnergy=False (IncludeEnergy = false):
// Apply this to the MFCC vectors after they have been computed.
// The offset is the same for every frame, so it is evaluated once.
var c0Offset = (float)(Math.Log(2) * Math.Sqrt(melCount));
foreach (var frame in mfccVectors)
{
    frame[0] -= c0Offset;
}
  - if appendEnergy=True (IncludeEnergy = true):
// Apply this to the MFCC vectors after they have been computed.
// The offset is the same for every frame, so it is evaluated once.
var ln2 = (float)Math.Log(2);
foreach (var frame in mfccVectors)
{
    frame[0] -= ln2;
}
- Set the filterbank explicitly (instead of `FilterBankSize = melCount`): `FilterBank = PsfFilterbank(samplingRate, melCount, fftSize)`.
The code of the `PsfFilterbank` function:
/// <summary>
/// Generates a triangular mel filterbank with weights identical to python_speech_features.
/// </summary>
/// <param name="samplingRate">Sampling rate of the signal, in Hz.</param>
/// <param name="filterbankSize">Number of filters (rows of the result).</param>
/// <param name="fftSize">FFT size; every filter holds fftSize / 2 + 1 weights.</param>
/// <param name="lowFreq">Lower edge of the filterbank, in Hz.</param>
/// <param name="highFreq">
/// Upper edge of the filterbank, in Hz. Any value that is not above
/// <paramref name="lowFreq"/> (including the default 0) selects samplingRate / 2.
/// </param>
/// <returns>Jagged array with one weight vector per filter.</returns>
float[][] PsfFilterbank(int samplingRate, int filterbankSize, int fftSize, double lowFreq = 0, double highFreq = 0)
{
    if (highFreq <= lowFreq)
    {
        // 2.0 avoids integer division, which would truncate for odd sampling rates
        highFreq = samplingRate / 2.0;
    }

    var low = NWaves.Utils.Scale.HerzToMel(lowFreq);
    var high = NWaves.Utils.Scale.HerzToMel(highFreq);

    // factor that converts a frequency in Hz to an FFT bin number
    var res = (fftSize + 1) / (float)samplingRate;

    // filterbankSize + 2 points evenly spaced on the mel scale between low and
    // high. The "low +" offset is required: without it the points start at
    // 0 mel and a non-zero lowFreq yields wrong band edges.
    var bins = Enumerable
        .Range(0, filterbankSize + 2)
        .Select(i => (float)Math.Floor(res * NWaves.Utils.Scale.MelToHerz(low + i * (high - low) / (filterbankSize + 1))))
        .ToArray();

    var filterbank = new float[filterbankSize][];

    for (var i = 0; i < filterbankSize; i++)
    {
        filterbank[i] = new float[fftSize / 2 + 1];

        // rising slope: from bins[i] up to the peak at bins[i + 1]
        for (var j = (int)bins[i]; j < (int)bins[i + 1]; j++)
        {
            filterbank[i][j] = (j - bins[i]) / (bins[i + 1] - bins[i]);
        }

        // falling slope: from the peak at bins[i + 1] down to bins[i + 2]
        for (var j = (int)bins[i + 1]; j < (int)bins[i + 2]; j++)
        {
            filterbank[i][j] = (bins[i + 2] - j) / (bins[i + 2] - bins[i + 1]);
        }
    }

    return filterbank;
}
PyTorch:
import torchaudio.transforms as T
# Same framing / filterbank values as the librosa and NWaves snippets below.
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40
# NOTE(review): `center` is not set here, whereas the librosa snippet below
# passes center=False - confirm that both use the same framing.
mel_spectrogram = T.MelSpectrogram(
sample_rate=sample_rate,
n_fft=n_fft,
win_length=win_length,
hop_length=hop_length,
power=2.0,
n_mels=n_mels,
)
melspec = mel_spectrogram(waveform)
librosa:
# Same framing / filterbank values as the PyTorch snippet.
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40

# The audio is passed by keyword (y=): recent librosa releases reject a
# positional audio argument, and the keyword form also works on older ones.
melspec_librosa = librosa.feature.melspectrogram(
    y=signal,
    sr=sample_rate,
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    center=False,
    power=2.0,
    n_mels=n_mels,
    norm=None,  # no filterbank normalization
    htk=True,   # HTK-like mel scale
)
NWaves:
// NWaves counterpart of the PyTorch / librosa mel-spectrogram snippets above.
var fftSize = 1024;                         // n_fft
var hopSize = 512;                          // hop_length
var melCount = 40;                          // n_mels
// NOTE(review): despite its name, mfccExtractor is a FilterbankExtractor
// (mel spectrogram), not an MFCC extractor.
var mfccExtractor = new FilterbankExtractor(
new FilterbankOptions
{
SamplingRate = samplingRate,
FrameSize = fftSize,
FftSize = fftSize,
HopSize = hopSize,
Window = WindowType.Hann,
// triangular mel bank with no Normalize call, in line with norm=None in the librosa snippet
FilterBank = FilterBanks.Triangular(fftSize, samplingRate, FilterBanks.MelBands(melCount, samplingRate)),
// if power = 1.0
// SpectrumType = SpectrumType.Magnitude
});