-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support computing features for whisper (#82)
- Loading branch information
1 parent
7912c2f
commit 01aed93
Showing
22 changed files
with
2,734 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
exclude_files=whisper-mel-bank.h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) | ||
|
||
import librosa | ||
import numpy as np | ||
|
||
|
||
def main():
    """Generate ``whisper-mel-bank.h`` with the mel filter bank as a C++ array.

    The filter bank comes from ``librosa.filters.mel`` for a 16 kHz sample
    rate, a 400-point FFT and 80 mel bins, which yields an (80, 201) matrix.
    The matrix is flattened row by row and emitted as
    ``kaldifeat::kWhisperMelArray`` together with ``kWhisperMelRows`` and
    ``kWhisperMelCols``.

    The header is written to ``whisper-mel-bank.h`` in the current working
    directory, overwriting any existing file.
    """
    m = librosa.filters.mel(sr=16000, n_fft=400, n_mels=80)
    assert m.shape == (80, 201)

    # Collect the pieces in a list and join once at the end instead of
    # growing one big string inside the loop.
    parts = [
        "// Auto-generated. Do NOT edit!\n\n",
        "// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)\n\n",
        "\n",
        "#ifndef KALDIFEAT_CSRC_WHISPER_MEL_BANK_H_\n",
        "#define KALDIFEAT_CSRC_WHISPER_MEL_BANK_H_\n",
        "\n",
        # The constants below are declared as int32_t, so the generated
        # header must include <cstdint> to be usable on its own.
        "#include <cstdint>\n",
        "\n",
        "namespace kaldifeat {\n\n",
        f"constexpr int32_t kWhisperMelRows = {m.shape[0]};\n",
        f"constexpr int32_t kWhisperMelCols = {m.shape[1]};\n",
        "\n",
        "constexpr float kWhisperMelArray[] = {\n",
    ]

    # Emit the values in row-major order. A line break is inserted after
    # every index that is a non-zero multiple of 7, so the first line holds
    # 8 values and the following ones 7 (same layout as before).
    sep = ""
    for i, value in enumerate(m.reshape(-1).tolist()):
        parts.append(f"{sep}{value:.8f}")
        sep = ", "
        if i and i % 7 == 0:
            parts.append(",\n")
            sep = ""

    parts.append("};\n\n")
    parts.append("}  // namespace kaldifeat\n\n")
    parts.append("#endif  // KALDIFEAT_CSRC_WHISPER_MEL_BANK_H_\n")

    with open("whisper-mel-bank.h", "w") as fout:
        fout.write("".join(parts))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
/** | ||
* Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) | ||
* | ||
* See LICENSE for clarification regarding multiple authors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "kaldifeat/csrc/whisper-fbank.h" | ||
|
||
#include <cmath> | ||
#include <vector> | ||
|
||
#include "kaldifeat/csrc/mel-computations.h" | ||
#include "kaldifeat/csrc/whisper-mel-bank.h" | ||
|
||
#ifndef M_2PI | ||
#define M_2PI 6.283185307179586476925286766559005 | ||
#endif | ||
|
||
namespace kaldifeat { | ||
|
||
// Creates a computer whose mel filter bank is built from the pre-generated
// table in whisper-mel-bank.h and placed on `opts.device`.
//
// Every field of `opts.frame_opts` listed below is overwritten with a fixed
// value, so whatever the caller supplied for those fields has no effect.
WhisperFbankComputer::WhisperFbankComputer(const WhisperFbankOptions &opts)
    : opts_(opts),
      mel_banks_(kWhisperMelArray, kWhisperMelRows, kWhisperMelCols,
                 opts.device) {
  FrameExtractionOptions &frame = opts_.frame_opts;

  // Framing: 16 kHz input, 25 ms windows advanced by 10 ms.
  frame.samp_freq = 16000;
  frame.frame_shift_ms = 10;
  frame.frame_length_ms = 25;

  // No dithering, pre-emphasis or DC removal before windowing.
  frame.dither = 0;
  frame.preemph_coeff = 0;
  frame.remove_dc_offset = false;

  // Hann window; the window length is not rounded up to a power of two.
  frame.window_type = "hann";
  frame.round_to_power_of_two = false;
  frame.snip_edges = false;
}
|
||
// Computes normalized log-mel features for a batch of frames.
//
// The first two parameters are unnamed and ignored; they are presumably kept
// so that the signature matches what OfflineFeatureTpl expects from a
// computer -- TODO confirm.
//
// @param signal_frame  2-D tensor with one frame per row. The number of
//                      columns must equal
//                      opts_.frame_opts.PaddedWindowSize().
// @return A tensor holding (log10(max(mel, 1e-10)) + 4) / 4, where the log
//         values are first floored at (global maximum - 8).
torch::Tensor WhisperFbankComputer::Compute(
    torch::Tensor /*signal_raw_log_energy*/, float /*vtln_warp*/,
    const torch::Tensor &signal_frame) {
  KALDIFEAT_ASSERT(signal_frame.dim() == 2);
  KALDIFEAT_ASSERT(signal_frame.size(1) == opts_.frame_opts.PaddedWindowSize());

  // Note: the spectrum below is a *power* spectrum (squared magnitude).
  //
  // With the frame options forced in the constructor (16 kHz, 25 ms, no
  // rounding to a power of two) a frame presumably holds 400 samples, so the
  // one-sided spectrum has 201 bins, matching kWhisperMelCols -- TODO confirm
  // against FrameExtractionOptions::PaddedWindowSize().
#if defined(KALDIFEAT_HAS_FFT_NAMESPACE)
  // signal_frame shape: [num_frames, window_size]
  // power shape:        [num_frames, window_size / 2 + 1]
  torch::Tensor power = torch::fft::rfft(signal_frame).abs().pow(2);
#else
  // signal_frame shape: [num_frames, window_size]
  // real_imag shape:    [num_frames, window_size / 2 + 1, 2],
  //   where [..., 0] is the real part
  //         [..., 1] is the imaginary part
  torch::Tensor real_imag = torch::rfft(signal_frame, 1);
  torch::Tensor real = real_imag.index({"...", 0});
  torch::Tensor imag = real_imag.index({"...", 1});
  torch::Tensor power = (real.square() + imag.square());
#endif

  // Floor applied before the logarithm so that log10(0) cannot occur.
  constexpr double kMinMelEnergy = 1e-10;
  // Values more than this far below the global maximum (in log10 units) are
  // raised to (maximum - kDynamicRange).
  constexpr double kDynamicRange = 8.0;

  torch::Tensor mel_energies = mel_banks_.Compute(power);
  torch::Tensor log_spec =
      torch::clamp_min(mel_energies, kMinMelEnergy).log10();

  // log_spec.max() is taken over the whole tensor, i.e. over every frame
  // passed in this call, not per frame.
  log_spec = torch::maximum(log_spec, log_spec.max() - kDynamicRange);

  // Final shift and scale of the log values.
  return (log_spec + 4.0) / 4.0;
}
|
||
} // namespace kaldifeat |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/** | ||
* Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) | ||
* | ||
* See LICENSE for clarification regarding multiple authors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#ifndef KALDIFEAT_CSRC_WHISPER_FBANK_H_ | ||
#define KALDIFEAT_CSRC_WHISPER_FBANK_H_ | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
#include "kaldifeat/csrc/feature-common.h" | ||
#include "kaldifeat/csrc/feature-window.h" | ||
#include "kaldifeat/csrc/mel-computations.h" | ||
|
||
namespace kaldifeat { | ||
|
||
struct WhisperFbankOptions { | ||
FrameExtractionOptions frame_opts; | ||
|
||
torch::Device device{"cpu"}; | ||
std::string ToString() const { | ||
std::ostringstream os; | ||
os << "WhisperFbankOptions("; | ||
os << "frame_opts=" << frame_opts.ToString() << ", "; | ||
os << "device=\"" << device << "\")"; | ||
return os.str(); | ||
} | ||
}; | ||
|
||
class WhisperFbankComputer { | ||
public: | ||
// note: Only frame_opts.device is used. All other fields from frame_opts | ||
// are ignored | ||
explicit WhisperFbankComputer(const WhisperFbankOptions &opts = {}); | ||
|
||
int32_t Dim() const { return 80; } | ||
|
||
const FrameExtractionOptions &GetFrameOptions() const { | ||
return opts_.frame_opts; | ||
} | ||
|
||
const WhisperFbankOptions &GetOptions() const { return opts_; } | ||
|
||
torch::Tensor Compute(torch::Tensor /*signal_raw_log_energy*/, | ||
float /*vtln_warp*/, const torch::Tensor &signal_frame); | ||
|
||
// if true, compute log_energy_pre_window but after dithering and dc removal | ||
bool NeedRawLogEnergy() const { return false; } | ||
using Options = WhisperFbankOptions; | ||
|
||
private: | ||
WhisperFbankOptions opts_; | ||
MelBanks mel_banks_; | ||
}; | ||
|
||
using WhisperFbank = OfflineFeatureTpl<WhisperFbankComputer>; | ||
|
||
} // namespace kaldifeat | ||
|
||
#endif // KALDIFEAT_CSRC_WHISPER_FBANK_H_ |
Oops, something went wrong.