[BC-Breaking] Remove compute_kaldi_pitch

This commit removes the compute_kaldi_pitch function and the underlying Kaldi integration from torchaudio. The Kaldi pitch function was added in a short period of time by integrating the original Kaldi implementation, instead of reimplementing it in PyTorch. The Kaldi integration employed a hack that replaces the base vector/matrix implementation of Kaldi with PyTorch Tensor so that there is only one BLAS library within torchaudio. Recently, we have been making torchaudio leaner, and we have not seen wide adoption of the kaldi_pitch feature, so we decided to remove it. See some of the discussion in pytorch#1269.
mthrok · Jun 2, 2023 · 0db6e14 · 0db6e14
1 parent b14ced1
commit 0db6e14
Show file tree

Hide file tree

Showing 24 changed files with 7 additions and 956 deletions.
diff --git a/.circleci/unittest/linux/scripts/run_style_checks.sh b/.circleci/unittest/linux/scripts/run_style_checks.sh
@@ -38,7 +38,7 @@ fi
 
 printf "\x1b[34mRunning clang-format:\x1b[0m\n"
 "${this_dir}"/run_clang_format.py \
-  -r torchaudio/csrc third_party/kaldi/src \
+  -r torchaudio/csrc \
   --clang-format-executable "${clangformat_path}" \
     && git diff --exit-code
 status=$?

diff --git a/.gitmodules b/.gitmodules
@@ -1,4 +0,0 @@
-[submodule "kaldi"]
-	path = third_party/kaldi/submodule
-	url = https://github.com/kaldi-asr/kaldi
-	ignore = dirty

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -53,7 +53,6 @@ endif()
 
 # Options
 option(BUILD_SOX "Build libsox statically" ON)
-option(BUILD_KALDI "Build kaldi statically" ON)
 option(BUILD_RIR "Enable RIR simulation" ON)
 option(BUILD_RNNT "Enable RNN transducer" ON)
 option(BUILD_ALIGN "Enable forced alignment" ON)

diff --git a/examples/tutorials/audio_feature_extractions_tutorial.py b/examples/tutorials/audio_feature_extractions_tutorial.py
@@ -406,54 +406,3 @@ def plot_pitch(waveform, sr, pitch):
 
 
 plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch)
-
-######################################################################
-# Kaldi Pitch (beta)
-# ------------------
-#
-# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic
-# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``,
-# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`.
-#
-# 1. A pitch extraction algorithm tuned for automatic speech recognition
-#
-#    Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S.
-#    Khudanpur
-#
-#    2014 IEEE International Conference on Acoustics, Speech and Signal
-#    Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
-#    10.1109/ICASSP.2014.6854049.
-#    [`abstract <https://ieeexplore.ieee.org/document/6854049>`__],
-#    [`paper <https://danielpovey.com/files/2014_icassp_pitch.pdf>`__]
-#
-
-pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE)
-pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1]
-
-######################################################################
-#
-
-def plot_kaldi_pitch(waveform, sr, pitch, nfcc):
-    _, axis = plt.subplots(1, 1)
-    axis.set_title("Kaldi Pitch Feature")
-    axis.grid(True)
-
-    end_time = waveform.shape[1] / sr
-    time_axis = torch.linspace(0, end_time, waveform.shape[1])
-    axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3)
-
-    time_axis = torch.linspace(0, end_time, pitch.shape[1])
-    ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green")
-    axis.set_ylim((-1.3, 1.3))
-
-    axis2 = axis.twinx()
-    time_axis = torch.linspace(0, end_time, nfcc.shape[1])
-    ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--")
-
-    lns = ln1 + ln2
-    labels = [l.get_label() for l in lns]
-    axis.legend(lns, labels, loc=0)
-    plt.show(block=False)
-
-
-plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc)
diff --git a/setup.py b/setup.py
@@ -124,7 +124,8 @@ def _fetch_archives(src):
 
 
 def _fetch_third_party_libraries():
-    _init_submodule()
+    # Revert this when a submodule is added again
+    # _init_submodule()
     if os.name != "nt":
         _fetch_archives(_parse_sources())
 

diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py
@@ -13,7 +13,6 @@
     skipIfNoExec,
     skipIfNoFFmpeg,
     skipIfNoHWAccel,
-    skipIfNoKaldi,
     skipIfNoMacOS,
     skipIfNoModule,
     skipIfNoQengine,
@@ -52,7 +51,6 @@
     "skipIfNoExec",
     "skipIfNoMacOS",
     "skipIfNoModule",
-    "skipIfNoKaldi",
     "skipIfNoRIR",
     "skipIfNoSox",
     "skipIfNoSoxBackend",

diff --git a/test/torchaudio_unittest/common_utils/case_utils.py b/test/torchaudio_unittest/common_utils/case_utils.py
@@ -234,11 +234,6 @@ def skipIfNoModule(module, display_name=None):
     reason="Sox features are not available.",
     key="NO_SOX",
 )
-skipIfNoKaldi = _skipIf(
-    not torchaudio._extension._IS_KALDI_AVAILABLE,
-    reason="Kaldi features are not available.",
-    key="NO_KALDI",
-)
 skipIfNoRIR = _skipIf(
     not torchaudio._extension._IS_RIR_AVAILABLE,
     reason="RIR features are not available.",

diff --git a/test/torchaudio_unittest/functional/batch_consistency_test.py b/test/torchaudio_unittest/functional/batch_consistency_test.py
@@ -257,18 +257,6 @@ def test_resample_waveform(self, resampling_method):
             atol=1e-7,
         )
 
-    @common_utils.skipIfNoKaldi
-    def test_compute_kaldi_pitch(self):
-        sample_rate = 44100
-        n_channels = 2
-        waveform = common_utils.get_whitenoise(sample_rate=sample_rate, n_channels=self.batch_size * n_channels)
-        batch = waveform.view(self.batch_size, n_channels, waveform.size(-1))
-        kwargs = {
-            "sample_rate": sample_rate,
-        }
-        func = partial(F.compute_kaldi_pitch, **kwargs)
-        self.assert_batch_consistency(func, inputs=(batch,))
-
     def test_lfilter(self):
         signal_length = 2048
         x = torch.randn(self.batch_size, signal_length)

diff --git a/test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py b/test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py
@@ -1,12 +1,7 @@
 import torch
 from torchaudio_unittest.common_utils import PytorchTestCase
 
-from .kaldi_compatibility_test_impl import Kaldi, KaldiCPUOnly
-
-
-class TestKaldiCPUOnly(KaldiCPUOnly, PytorchTestCase):
-    dtype = torch.float32
-    device = torch.device("cpu")
+from .kaldi_compatibility_test_impl import Kaldi
 
 
 class TestKaldiFloat32(Kaldi, PytorchTestCase):

diff --git a/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py b/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py
@@ -1,14 +1,6 @@
 import torch
 import torchaudio.functional as F
-from parameterized import parameterized
-from torchaudio_unittest.common_utils import (
-    get_sinusoid,
-    load_params,
-    save_wav,
-    skipIfNoExec,
-    TempDirMixin,
-    TestBaseMixin,
-)
+from torchaudio_unittest.common_utils import skipIfNoExec, TempDirMixin, TestBaseMixin
 from torchaudio_unittest.common_utils.kaldi_utils import convert_args, run_kaldi
 
 
@@ -32,25 +24,3 @@ def test_sliding_window_cmn(self):
         command = ["apply-cmvn-sliding"] + convert_args(**kwargs) + ["ark:-", "ark:-"]
         kaldi_result = run_kaldi(command, "ark", tensor)
         self.assert_equal(result, expected=kaldi_result)
-
-
-class KaldiCPUOnly(TempDirMixin, TestBaseMixin):
-    def assert_equal(self, output, *, expected, rtol=None, atol=None):
-        expected = expected.to(dtype=self.dtype, device=self.device)
-        self.assertEqual(output, expected, rtol=rtol, atol=atol)
-
-    @parameterized.expand(load_params("kaldi_test_pitch_args.jsonl"))
-    @skipIfNoExec("compute-kaldi-pitch-feats")
-    def test_pitch_feats(self, kwargs):
-        """compute_kaldi_pitch produces numerically compatible result with compute-kaldi-pitch-feats"""
-        sample_rate = kwargs["sample_rate"]
-        waveform = get_sinusoid(dtype="float32", sample_rate=sample_rate)
-        result = F.compute_kaldi_pitch(waveform[0], **kwargs)
-
-        waveform = get_sinusoid(dtype="int16", sample_rate=sample_rate)
-        wave_file = self.get_temp_path("test.wav")
-        save_wav(wave_file, waveform, sample_rate)
-
-        command = ["compute-kaldi-pitch-feats"] + convert_args(**kwargs) + ["scp:-", "ark:-"]
-        kaldi_result = run_kaldi(command, "scp", wave_file)
-        self.assert_equal(result, expected=kaldi_result)
diff --git a/test/torchaudio_unittest/functional/torchscript_consistency_impl.py b/test/torchaudio_unittest/functional/torchscript_consistency_impl.py
@@ -585,18 +585,6 @@ def func(tensor):
         tensor = common_utils.get_whitenoise(sample_rate=44100)
         self._assert_consistency(func, (tensor,))
 
-    @common_utils.skipIfNoKaldi
-    def test_compute_kaldi_pitch(self):
-        if self.dtype != torch.float32 or self.device != torch.device("cpu"):
-            raise unittest.SkipTest("Only float32, cpu is supported.")
-
-        def func(tensor):
-            sample_rate: float = 44100.0
-            return F.compute_kaldi_pitch(tensor, sample_rate)
-
-        tensor = common_utils.get_whitenoise(sample_rate=44100)
-        self._assert_consistency(func, (tensor,))
-
     def test_resample_sinc(self):
         def func(tensor):
             sr1, sr2 = 16000, 8000

diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
@@ -9,10 +9,3 @@ file(MAKE_DIRECTORY install/lib)
 if (BUILD_SOX)
   add_subdirectory(sox)
 endif()
-
-################################################################################
-# kaldi
-################################################################################
-if (BUILD_KALDI)
-  add_subdirectory(kaldi)
-endif()
diff --git a/third_party/kaldi/CMakeLists.txt b/third_party/kaldi/CMakeLists.txt
diff --git a/third_party/kaldi/README.md b/third_party/kaldi/README.md
diff --git a/third_party/kaldi/kaldi.patch b/third_party/kaldi/kaldi.patch
diff --git a/third_party/kaldi/src/matrix/kaldi-matrix.cc b/third_party/kaldi/src/matrix/kaldi-matrix.cc