Skip to content

Commit

Permalink
[BC-Breaking] Remove compute_kaldi_pitch
Browse files Browse the repository at this point in the history
This commit removes the compute_kaldi_pitch function and the underlying
Kaldi integration from torchaudio.

The Kaldi pitch function was added in a short period of time by
integrating the original Kaldi implementation, instead of
reimplementing it in PyTorch.

The Kaldi integration employed a hack that replaces the base
vector/matrix implementation of Kaldi with PyTorch Tensor so that
there is only one BLAS library within torchaudio.

Recently, we have been making torchaudio leaner, and we have not seen
wide adoption of the kaldi_pitch feature, so we decided to remove it.

See some of the discussion in pytorch#1269
  • Loading branch information
mthrok committed May 31, 2023
1 parent f7cb6c6 commit 0102f69
Show file tree
Hide file tree
Showing 23 changed files with 6 additions and 955 deletions.
4 changes: 0 additions & 4 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,4 +0,0 @@
[submodule "kaldi"]
path = third_party/kaldi/submodule
url = https://github.com/kaldi-asr/kaldi
ignore = dirty
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ endif()

# Options
option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_RIR "Enable RIR simulation" ON)
option(BUILD_RNNT "Enable RNN transducer" ON)
option(BUILD_ALIGN "Enable forced alignment" ON)
Expand Down
51 changes: 0 additions & 51 deletions examples/tutorials/audio_feature_extractions_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,54 +406,3 @@ def plot_pitch(waveform, sr, pitch):


plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch)

######################################################################
# Kaldi Pitch (beta)
# ------------------
#
# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic
# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``,
# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`.
#
# 1. A pitch extraction algorithm tuned for automatic speech recognition
#
# P. Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S.
# Khudanpur
#
# 2014 IEEE International Conference on Acoustics, Speech and Signal
# Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
# 10.1109/ICASSP.2014.6854049.
# [`abstract <https://ieeexplore.ieee.org/document/6854049>`__],
# [`paper <https://danielpovey.com/files/2014_icassp_pitch.pdf>`__]
#

pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE)
pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1]

######################################################################
#

def plot_kaldi_pitch(waveform, sr, pitch, nfcc):
    """Plot a waveform with its Kaldi pitch and NFCC tracks overlaid.

    The first row of ``waveform`` is drawn in faint gray as a backdrop, the
    first row of ``pitch`` is drawn in green on the same axis, and the first
    row of ``nfcc`` is drawn as a dashed blue line on a twin y-axis. All three
    curves share one time span running from 0 to ``waveform.shape[1] / sr``.

    Args:
        waveform: Signal to draw; indexed as ``waveform[0]`` and
            ``waveform.shape[1]`` (assumed to be laid out as
            ``(channel, time)`` -- confirm against the caller).
        sr: Sample rate used to convert the waveform length into a duration.
        pitch: Pitch track; only ``pitch[0]`` is plotted.
        nfcc: Second feature track; only ``nfcc[0]`` is plotted.
    """
    _, ax_pitch = plt.subplots(1, 1)
    ax_pitch.set_title("Kaldi Pitch Feature")
    ax_pitch.grid(True)

    duration = waveform.shape[1] / sr

    def _timeline(num_points):
        # Evenly spaced time stamps covering the whole clip for a curve
        # that has ``num_points`` samples.
        return torch.linspace(0, duration, num_points)

    # Raw waveform as a light background reference.
    ax_pitch.plot(_timeline(waveform.shape[1]), waveform[0], linewidth=1, color="gray", alpha=0.3)

    pitch_lines = ax_pitch.plot(_timeline(pitch.shape[1]), pitch[0], linewidth=2, label="Pitch", color="green")
    ax_pitch.set_ylim((-1.3, 1.3))

    # The second feature gets its own y-scale on a twin axis.
    ax_nfcc = ax_pitch.twinx()
    nfcc_lines = ax_nfcc.plot(
        _timeline(nfcc.shape[1]), nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--"
    )

    # Merge the handles from both axes so a single legend lists both curves.
    handles = pitch_lines + nfcc_lines
    ax_pitch.legend(handles, [handle.get_label() for handle in handles], loc=0)
    plt.show(block=False)


plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc)
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ def _fetch_archives(src):


# Fetch the third-party sources needed for the build.
# The git submodule initialization step is intentionally disabled (see the
# commented-out call below); on every platform except Windows
# (``os.name != "nt"``) the archives described by ``_parse_sources()`` are
# fetched via ``_fetch_archives``.
def _fetch_third_party_libraries():
# Revert this when a submodule is added again
# _init_submodule()
if os.name != "nt":
_fetch_archives(_parse_sources())


Expand Down
2 changes: 0 additions & 2 deletions test/torchaudio_unittest/common_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
skipIfNoExec,
skipIfNoFFmpeg,
skipIfNoHWAccel,
skipIfNoKaldi,
skipIfNoMacOS,
skipIfNoModule,
skipIfNoQengine,
Expand Down Expand Up @@ -52,7 +51,6 @@
"skipIfNoExec",
"skipIfNoMacOS",
"skipIfNoModule",
"skipIfNoKaldi",
"skipIfNoRIR",
"skipIfNoSox",
"skipIfNoSoxBackend",
Expand Down
5 changes: 0 additions & 5 deletions test/torchaudio_unittest/common_utils/case_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,6 @@ def skipIfNoModule(module, display_name=None):
reason="Sox features are not available.",
key="NO_SOX",
)
skipIfNoKaldi = _skipIf(
not torchaudio._extension._IS_KALDI_AVAILABLE,
reason="Kaldi features are not available.",
key="NO_KALDI",
)
skipIfNoRIR = _skipIf(
not torchaudio._extension._IS_RIR_AVAILABLE,
reason="RIR features are not available.",
Expand Down
12 changes: 0 additions & 12 deletions test/torchaudio_unittest/functional/batch_consistency_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,18 +257,6 @@ def test_resample_waveform(self, resampling_method):
atol=1e-7,
)

@common_utils.skipIfNoKaldi
def test_compute_kaldi_pitch(self):
    """Run the batch-consistency assertion on ``F.compute_kaldi_pitch``.

    White noise with ``batch_size * n_channels`` channels is generated and
    reshaped into ``(batch_size, n_channels, time)`` before being handed to
    ``assert_batch_consistency``.
    """
    sample_rate = 44100
    n_channels = 2
    total_channels = self.batch_size * n_channels
    noise = common_utils.get_whitenoise(sample_rate=sample_rate, n_channels=total_channels)
    batch = noise.view(self.batch_size, n_channels, noise.size(-1))
    # Bind the sample rate so the checker only has to supply the waveform.
    func = partial(F.compute_kaldi_pitch, sample_rate=sample_rate)
    self.assert_batch_consistency(func, inputs=(batch,))

def test_lfilter(self):
signal_length = 2048
x = torch.randn(self.batch_size, signal_length)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
import torch
from torchaudio_unittest.common_utils import PytorchTestCase

from .kaldi_compatibility_test_impl import Kaldi, KaldiCPUOnly


# Concrete test case that runs the KaldiCPUOnly test mix-in with
# float32 tensors on the CPU device.
class TestKaldiCPUOnly(KaldiCPUOnly, PytorchTestCase):
dtype = torch.float32
device = torch.device("cpu")
from .kaldi_compatibility_test_impl import Kaldi


class TestKaldiFloat32(Kaldi, PytorchTestCase):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
import torch
import torchaudio.functional as F
from parameterized import parameterized
from torchaudio_unittest.common_utils import (
get_sinusoid,
load_params,
save_wav,
skipIfNoExec,
TempDirMixin,
TestBaseMixin,
)
from torchaudio_unittest.common_utils import skipIfNoExec, TempDirMixin, TestBaseMixin
from torchaudio_unittest.common_utils.kaldi_utils import convert_args, run_kaldi


Expand All @@ -32,25 +24,3 @@ def test_sliding_window_cmn(self):
command = ["apply-cmvn-sliding"] + convert_args(**kwargs) + ["ark:-", "ark:-"]
kaldi_result = run_kaldi(command, "ark", tensor)
self.assert_equal(result, expected=kaldi_result)


class KaldiCPUOnly(TempDirMixin, TestBaseMixin):
    """Compatibility tests comparing ``F.compute_kaldi_pitch`` with the Kaldi CLI.

    Concrete subclasses are expected to provide ``dtype`` and ``device``
    attributes, which ``assert_equal`` uses to convert the reference result.
    """

    def assert_equal(self, output, *, expected, rtol=None, atol=None):
        """Compare ``output`` with ``expected`` after moving the latter to ``self.dtype``/``self.device``."""
        reference = expected.to(dtype=self.dtype, device=self.device)
        self.assertEqual(output, reference, rtol=rtol, atol=atol)

    @parameterized.expand(load_params("kaldi_test_pitch_args.jsonl"))
    @skipIfNoExec("compute-kaldi-pitch-feats")
    def test_pitch_feats(self, kwargs):
        """compute_kaldi_pitch produces numerically compatible result with compute-kaldi-pitch-feats"""
        sample_rate = kwargs["sample_rate"]

        # torchaudio side: float32 sinusoid, first channel only.
        float_wave = get_sinusoid(dtype="float32", sample_rate=sample_rate)
        result = F.compute_kaldi_pitch(float_wave[0], **kwargs)

        # Kaldi side: an int16 sinusoid is written to a temporary wav file,
        # whose path is handed to the command-line tool.
        int_wave = get_sinusoid(dtype="int16", sample_rate=sample_rate)
        wave_file = self.get_temp_path("test.wav")
        save_wav(wave_file, int_wave, sample_rate)

        command = ["compute-kaldi-pitch-feats", *convert_args(**kwargs), "scp:-", "ark:-"]
        kaldi_result = run_kaldi(command, "scp", wave_file)
        self.assert_equal(result, expected=kaldi_result)
Original file line number Diff line number Diff line change
Expand Up @@ -585,18 +585,6 @@ def func(tensor):
tensor = common_utils.get_whitenoise(sample_rate=44100)
self._assert_consistency(func, (tensor,))

# Consistency check for F.compute_kaldi_pitch on white noise.
# The test is skipped unless it runs with float32 on the CPU device.
# NOTE(review): `_assert_consistency` presumably compares eager and
# TorchScript results -- confirm against its definition.
@common_utils.skipIfNoKaldi
def test_compute_kaldi_pitch(self):
if self.dtype != torch.float32 or self.device != torch.device("cpu"):
raise unittest.SkipTest("Only float32, cpu is supported.")

def func(tensor):
# The sample rate is declared as an explicitly annotated float local.
sample_rate: float = 44100.0
return F.compute_kaldi_pitch(tensor, sample_rate)

tensor = common_utils.get_whitenoise(sample_rate=44100)
self._assert_consistency(func, (tensor,))

def test_resample_sinc(self):
def func(tensor):
sr1, sr2 = 16000, 8000
Expand Down
7 changes: 0 additions & 7 deletions third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,3 @@ file(MAKE_DIRECTORY install/lib)
if (BUILD_SOX)
add_subdirectory(sox)
endif()

################################################################################
# kaldi
################################################################################
if (BUILD_KALDI)
add_subdirectory(kaldi)
endif()
32 changes: 0 additions & 32 deletions third_party/kaldi/CMakeLists.txt

This file was deleted.

6 changes: 0 additions & 6 deletions third_party/kaldi/README.md

This file was deleted.

76 changes: 0 additions & 76 deletions third_party/kaldi/kaldi.patch

This file was deleted.

39 changes: 0 additions & 39 deletions third_party/kaldi/src/matrix/kaldi-matrix.cc

This file was deleted.

Loading

0 comments on commit 0102f69

Please sign in to comment.