Skip to content

MFCC and Mel Spectrogram

Tim Sharii edited this page Nov 22, 2021 · 8 revisions

MFCC

NWaves and librosa / PyTorch

Scheme:

  1. Spectrum: power spectrum
  2. Mel Filterbank: HTK-like (default in PyTorch) or Slaney (default in librosa)
  3. Nonlinearity: 10*Log10()
  4. DCT: 2N (default)

PyTorch:

import torchaudio.transforms as T

# Analysis parameters shared with the librosa / NWaves examples.
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 24
n_mfcc = 13

mfcc_transform = T.MFCC(
    sample_rate=sample_rate,
    n_mfcc=n_mfcc,
    melkwargs={
      'n_fft': n_fft,
      'n_mels': n_mels,
      'hop_length': hop_length
    }
)

# `waveform` is the torch tensor holding the audio; the librosa example
# converts this same tensor with waveform.numpy()[0].
mfcc = mfcc_transform(waveform)

librosa:

# unfortunately, you'll need to call two methods separately
# (because of two different meanings of the 'norm' parameter)

melspec = librosa.feature.melspectrogram(
  y=waveform.numpy()[0], sr=sample_rate, n_fft=n_fft,
  win_length=win_length, hop_length=hop_length,
  n_mels=n_mels, htk=True, norm=None)

mfcc_librosa = librosa.feature.mfcc(
  S=librosa.core.spectrum.power_to_db(melspec),
  n_mfcc=n_mfcc, dct_type=2, norm='ortho')

NWaves:

var fftSize = 1024;
var melCount = 24;

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 13,
    FrameSize = fftSize,
    HopSize = 512,
    FilterBankSize = melCount,
    SpectrumType = SpectrumType.Power,
    NonLinearity = NonLinearityType.ToDecibel,
    DctType = "2N",
    FftSize = fftSize,
    Window = WindowType.Hann,
    LogFloor = 1e-10f
};

If your calculations are based on the librosa.feature.mfcc method, called like this:

# Audio and sampling rate are passed by keyword: recent librosa releases
# accept them only as keyword arguments (y=..., sr=...).
librosa.feature.mfcc(
    y=signal,
    sr=sample_rate,
    n_mfcc=13,
    dct_type=2,
    norm='ortho',
    window='hann',
    htk=True,
    n_mels=24,
    n_fft=1024,
    hop_length=512,
    center=False)

and you want full compliance with it, you can specify the following settings in NWaves:

var fftSize = 1024;
var melCount = 24;

var freqs = FilterBanks.MelBands(melCount, samplingRate);
var fbank = FilterBanks.Triangular(fftSize, samplingRate, freqs);
FilterBanks.Normalize(melCount, freqs, fbank);

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 13,
    FrameSize = fftSize,
    HopSize = 512,
    FilterBank = fbank,
    SpectrumType = SpectrumType.Power,
    NonLinearity = NonLinearityType.ToDecibel,
    DctType = "2N",
    FftSize = fftSize,
    Window = WindowType.Hann,
    LogFloor = 1e-10f
};

NWaves and librosa only (Slaney mel-filterbank)

librosa (htk=False, i.e. Slaney mel-filterbank):

# Audio and sampling rate are passed by keyword: recent librosa releases
# accept them only as keyword arguments (y=..., sr=...).
librosa.feature.mfcc(
    y=signal,
    sr=sample_rate,
    n_mfcc=13,
    dct_type=2,
    norm='ortho',
    window='hann',
    htk=False,
    n_mels=40,
    n_fft=1024,
    hop_length=512,
    center=False)

NWaves:

var fftSize = 1024;
var melCount = 40;

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 13,
    FrameSize = fftSize,
    HopSize = 512,
    FilterBank = FilterBanks.MelBankSlaney(melCount, fftSize, samplingRate),
    SpectrumType = SpectrumType.Power,
    NonLinearity = NonLinearityType.ToDecibel,
    DctType = "2N",
    FftSize = fftSize,
    Window = WindowType.Hann,
    LogFloor = 1e-10f
};

NWaves and Kaldi / torchaudio.compliance.kaldi

Scheme:

  1. Spectrum: power spectrum
  2. Mel Filterbank: HTK-like
  3. Nonlinearity: Ln()
  4. DCT: 2N
  5. and a lot of other parameters
m = torchaudio.compliance.kaldi.mfcc(waveform, frame_length=25, window_type='blackman', preemphasis_coefficient=0.97, remove_dc_offset=False)

NWaves:

// TODO

NWaves and python_speech_features

Scheme:

  1. Spectrum: power spectrum (normalized)
  2. Mel Filterbank: essentially HTK-like (but slightly different)
  3. Nonlinearity: Ln()
  4. DCT: 2N
# s = s / 32768   # if samples are not in range [-1, 1]
                  # (or don't normalize signal in NWaves)

mfcc(
    signal,
    samplerate=16000,
    winlen=0.025,
    winstep=0.01,
    numcep=16,
    nfilt=26,
    nfft=512,
    lowfreq=0,
    highfreq=None,
    preemph=0.97,
    ceplifter=22,
    appendEnergy=False,
    winfunc=numpy.hanning
)

NWaves:

var samplingRate = 16000;
var fftSize = 512;
var melCount = 26;

var mfccOptions = new MfccOptions
{
    SamplingRate = samplingRate,
    FeatureCount = 16,
    FrameDuration = 0.025,
    HopDuration = 0.01,
    FilterBankSize = melCount,
    SpectrumType = SpectrumType.PowerNormalized,
    NonLinearity = NonLinearityType.LogE,
    DctType = "2N",
    FftSize = fftSize,
    PreEmphasis = 0.97,
    LifterSize = 22,
    IncludeEnergy = false,
    Window = WindowType.Hann,
};

For complete compliance with python_speech_features:

  1. Correction for the first coefficient:
  • if appendEnergy=False (IncludeEnergy = false)
// call this on already computed mfccVectors:

for (var i = 0; i < mfccVectors.Count; i++)
{
    mfccVectors[i][0] -= (float)(Math.Log(2) * Math.Sqrt(melCount));
}
  • if appendEnergy=True (IncludeEnergy = true)
// call this on already computed mfccVectors:

for (var i = 0; i < mfccVectors.Count; i++)
{
    mfccVectors[i][0] -= (float)Math.Log(2);
}
  2. Set the filterbank explicitly (instead of FilterBankSize = melCount)

FilterBank = PsfFilterbank(samplingRate, melCount, fftSize).

The code of PsfFilterbank function:

/// <summary>
/// Generates filterbank with weights identical to python_speech_features.
/// </summary>
/// <param name="samplingRate">Sampling rate of the signal (Hz)</param>
/// <param name="filterbankSize">Number of triangular mel filters</param>
/// <param name="fftSize">FFT size; each filter holds fftSize / 2 + 1 weights</param>
/// <param name="lowFreq">Lower edge of the filterbank (Hz)</param>
/// <param name="highFreq">Upper edge of the filterbank (Hz); if it is not greater than lowFreq, samplingRate / 2 is used</param>
/// <returns>Array of filterbankSize filters, each of length fftSize / 2 + 1</returns>
float[][] PsfFilterbank(int samplingRate, int filterbankSize, int fftSize, double lowFreq = 0, double highFreq = 0)
{
    var filterbank = new float[filterbankSize][];

    if (highFreq <= lowFreq)
    {
        // floating-point division: keeps the exact Nyquist frequency for odd sampling rates
        highFreq = samplingRate / 2.0;
    }

    var low = NWaves.Utils.Scale.HerzToMel(lowFreq);
    var high = NWaves.Utils.Scale.HerzToMel(highFreq);

    var res = (fftSize + 1) / (float)samplingRate;

    // filterbankSize + 2 points spaced uniformly on the mel scale between low and high
    // (both edges included), converted back to Hz and mapped to FFT bin indices.
    // The 'low +' offset matters whenever lowFreq is not zero.
    var melStep = (high - low) / (filterbankSize + 1);

    var bins = Enumerable
                  .Range(0, filterbankSize + 2)
                  .Select(i => (float)Math.Floor(res * NWaves.Utils.Scale.MelToHerz(low + i * melStep)))
                  .ToArray();

    for (var i = 0; i < filterbankSize; i++)
    {
        filterbank[i] = new float[fftSize / 2 + 1];

        // rising slope: from bins[i] up to (not including) bins[i + 1]
        for (var j = (int)bins[i]; j < (int)bins[i + 1]; j++)
        {
            filterbank[i][j] = (j - bins[i]) / (bins[i + 1] - bins[i]);
        }
        // falling slope: from bins[i + 1] up to (not including) bins[i + 2]
        for (var j = (int)bins[i + 1]; j < (int)bins[i + 2]; j++)
        {
            filterbank[i][j] = (bins[i + 2] - j) / (bins[i + 2] - bins[i + 1]);
        }
    }

    return filterbank;
}

Mel-Spectrogram

PyTorch:

import torchaudio.transforms as T

n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40

mel_spectrogram = T.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    power=2.0,
    n_mels=n_mels,
)

melspec = mel_spectrogram(waveform)

librosa:

n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40

# Audio is passed by keyword: recent librosa releases accept it
# only as a keyword argument (y=...).
melspec_librosa = librosa.feature.melspectrogram(
    y=signal,
    sr=sample_rate,
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    center=False,
    power=2.0,
    n_mels=n_mels,
    norm=None,
    htk=True,
)

NWaves:

var fftSize = 1024;
var hopSize = 512;
var melCount = 40;

// The extractor applies a triangular mel filterbank to each frame's spectrum,
// i.e. it yields a mel-spectrogram (not MFCC), hence the variable name.
var melSpectrogramExtractor = new FilterbankExtractor(
   new FilterbankOptions
   {
       SamplingRate = samplingRate,
       FrameSize = fftSize,
       FftSize = fftSize,
       HopSize = hopSize,
       Window = WindowType.Hann,
       FilterBank = FilterBanks.Triangular(fftSize, samplingRate, FilterBanks.MelBands(melCount, samplingRate)),

       // if power = 1.0
       // SpectrumType = SpectrumType.Magnitude
   });
Clone this wiki locally