Add Kaldi Pitch feature (#1243)

pytorch · Feb 9, 2021 · 7ee1c46 · 7ee1c46
1 parent 9e58e75
commit 7ee1c46
Show file tree

Hide file tree

Showing 24 changed files with 1,025 additions and 46 deletions.
diff --git a/.circleci/unittest/linux/scripts/run_style_checks.sh b/.circleci/unittest/linux/scripts/run_style_checks.sh
@@ -38,7 +38,7 @@ fi
 
 printf "\x1b[34mRunning clang-format:\x1b[0m\n"
 "${this_dir}"/run_clang_format.py \
-  -r torchaudio/csrc \
+  -r torchaudio/csrc third_party/kaldi/src \
   --clang-format-executable "${clangformat_path}" \
     && git diff --exit-code
 status=$?

diff --git a/.gitmodules b/.gitmodules
@@ -2,3 +2,7 @@
 	path = third_party/transducer/submodule
 	url = https://github.com/HawkAaron/warp-transducer
 	ignore = dirty
+[submodule "kaldi"]
+	path = third_party/kaldi/submodule
+	url = https://github.com/kaldi-asr/kaldi
+	ignore = dirty
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -47,6 +47,7 @@ endif()
 
 # Options
 option(BUILD_SOX "Build libsox statically" OFF)
+option(BUILD_KALDI "Build kaldi statically" ON)
 option(BUILD_TRANSDUCER "Enable transducer" OFF)
 option(BUILD_LIBTORCHAUDIO "Build C++ Library" ON)
 option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)

diff --git a/build_tools/setup_helpers/extension.py b/build_tools/setup_helpers/extension.py
@@ -68,6 +68,7 @@ def build_extension(self, ext):
             '-DCMAKE_VERBOSE_MAKEFILE=ON',
             f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
             f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
+            "-DBUILD_KALDI:BOOL=ON",
             f"-DBUILD_TRANSDUCER:BOOL={'ON' if _BUILD_TRANSDUCER else 'OFF'}",
             "-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON",
             "-DBUILD_LIBTORCHAUDIO:BOOL=OFF",

diff --git a/test/torchaudio_unittest/assets/kaldi_test_pitch_args.json b/test/torchaudio_unittest/assets/kaldi_test_pitch_args.json
@@ -0,0 +1,5 @@
+{"sample_rate": 8000}
+{"sample_rate": 8000, "frames_per_chunk": 200}
+{"sample_rate": 8000, "frames_per_chunk": 200, "simulate_first_pass_online": true}
+{"sample_rate": 16000}
+{"sample_rate": 44100}
diff --git a/test/torchaudio_unittest/common_utils/kaldi_utils.py b/test/torchaudio_unittest/common_utils/kaldi_utils.py
@@ -0,0 +1,39 @@
+import subprocess
+
+import torch
+
+
+def convert_args(**kwargs):
+    """Turn keyword arguments into Kaldi-style ``--name=value`` command line options.
+
+    ``sample_rate`` is emitted as ``--sample-frequency``. Underscores in option
+    names become hyphens, and values equal to ``True``/``False`` are lower-cased.
+
+    Returns:
+        list of str: One ``--name=value`` entry per keyword argument, in call order.
+    """
+    def _as_option(name, value):
+        if name == 'sample_rate':
+            name = 'sample_frequency'
+        is_flag = value in [True, False]
+        text = str(value)
+        if is_flag:
+            text = text.lower()
+        return '--%s=%s' % (name.replace('_', '-'), text)
+
+    return [_as_option(name, value) for name, value in kwargs.items()]
+
+
+def run_kaldi(command, input_type, input_value):
+    """Run provided Kaldi command, pass a tensor and get the resulting tensor
+
+    Args:
+        command: list of str
+            Executable and arguments, passed as-is to ``subprocess.Popen``
+        input_type: str
+            'ark' or 'scp'
+        input_value:
+            Tensor for 'ark'
+            string for 'scp' (path to an audio file)
+
+    Returns:
+        Tensor: The matrix read from the command's stdout under the key that was written.
+
+    Raises:
+        NotImplementedError: If ``input_type`` is neither 'ark' nor 'scp'.
+    """
+    import kaldi_io
+
+    # Validate before spawning so that a bad argument never leaves a child process behind.
+    if input_type not in ('ark', 'scp'):
+        raise NotImplementedError('Unexpected type')
+
+    key = 'foo'
+    # Popen as a context manager closes the pipes and waits for the child on exit,
+    # so the process is reaped even if reading the result raises.
+    with subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as process:
+        if input_type == 'ark':
+            kaldi_io.write_mat(process.stdin, input_value.cpu().numpy(), key=key)
+        else:
+            process.stdin.write(f'{key} {input_value}'.encode('utf8'))
+        # Closing stdin signals end of input to the command before its output is read.
+        process.stdin.close()
+        result = dict(kaldi_io.read_mat_ark(process.stdout))[key]
+    return torch.from_numpy(result.copy())  # copy suppresses some torch warning
diff --git a/test/torchaudio_unittest/functional/batch_consistency_test.py b/test/torchaudio_unittest/functional/batch_consistency_test.py
@@ -184,3 +184,9 @@ def test_vad(self):
         waveform, sample_rate = torchaudio.load(filepath)
         self.assert_batch_consistencies(
             F.vad, waveform, sample_rate=sample_rate)
+
+    @common_utils.skipIfNoExtension
+    def test_compute_kaldi_pitch(self):
+        """Run assert_batch_consistencies on F.compute_kaldi_pitch with 44.1 kHz white noise"""
+        rate = 44100
+        noise = common_utils.get_whitenoise(sample_rate=rate)
+        self.assert_batch_consistencies(F.compute_kaldi_pitch, noise, sample_rate=rate)
diff --git a/test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py b/test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py
@@ -0,0 +1,9 @@
+import torch
+
+from torchaudio_unittest.common_utils import PytorchTestCase
+from .kaldi_compatibility_test_impl import KaldiCPUOnly
+
+
+class TestKaldiCPUOnly(KaldiCPUOnly, PytorchTestCase):
+    """Concrete KaldiCPUOnly suite bound to float32 tensors on the CPU device."""
+    device = torch.device('cpu')
+    dtype = torch.float32
diff --git a/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py b/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py
@@ -0,0 +1,37 @@
+from parameterized import parameterized
+import torchaudio.functional as F
+
+from torchaudio_unittest.common_utils import (
+    get_sinusoid,
+    load_params,
+    save_wav,
+    skipIfNoExec,
+    TempDirMixin,
+    TestBaseMixin,
+)
+from torchaudio_unittest.common_utils.kaldi_utils import (
+    convert_args,
+    run_kaldi,
+)
+
+
+class KaldiCPUOnly(TempDirMixin, TestBaseMixin):
+    """Numerical compatibility tests against Kaldi command line executables."""
+
+    def assert_equal(self, output, *, expected, rtol=None, atol=None):
+        """Compare ``output`` with ``expected`` after casting ``expected`` to the suite dtype/device"""
+        reference = expected.to(dtype=self.dtype, device=self.device)
+        self.assertEqual(output, reference, rtol=rtol, atol=atol)
+
+    @parameterized.expand(load_params('kaldi_test_pitch_args.json'))
+    @skipIfNoExec('compute-kaldi-pitch-feats')
+    def test_pitch_feats(self, kwargs):
+        """compute_kaldi_pitch produces numerically compatible result with compute-kaldi-pitch-feats"""
+        rate = kwargs['sample_rate']
+
+        # Under test: the float32 sinusoid, taking index 0 of the generated waveform.
+        actual = F.compute_kaldi_pitch(get_sinusoid(dtype='float32', sample_rate=rate)[0], **kwargs)
+
+        # Reference: the Kaldi executable reads an int16 wav file written to a temp path.
+        wav_path = self.get_temp_path('test.wav')
+        save_wav(wav_path, get_sinusoid(dtype='int16', sample_rate=rate), rate)
+        cli = ['compute-kaldi-pitch-feats', *convert_args(**kwargs), 'scp:-', 'ark:-']
+        reference = run_kaldi(cli, 'scp', wav_path)
+
+        self.assert_equal(actual, expected=reference)
diff --git a/test/torchaudio_unittest/functional/torchscript_consistency_impl.py b/test/torchaudio_unittest/functional/torchscript_consistency_impl.py
@@ -547,3 +547,15 @@ def func(tensor):
 
         tensor = common_utils.get_whitenoise(sample_rate=44100)
         self._assert_consistency(func, tensor)
+
+    @common_utils.skipIfNoExtension
+    def test_compute_kaldi_pitch(self):
+        """Run _assert_consistency on a wrapper around F.compute_kaldi_pitch (float32 on CPU only)"""
+        # Any other dtype/device combination is reported as skipped rather than failed.
+        if self.dtype != torch.float32 or self.device != torch.device('cpu'):
+            raise unittest.SkipTest("Only float32, cpu is supported.")
+
+        def func(tensor):
+            # The sample rate is an annotated float local, so the wrapper takes only the tensor.
+            sample_rate: float = 44100.
+            return F.compute_kaldi_pitch(tensor, sample_rate)
+
+        # White noise generated at the same 44100 rate that func passes on.
+        tensor = common_utils.get_whitenoise(sample_rate=44100)
+        self._assert_consistency(func, tensor)
diff --git a/test/torchaudio_unittest/kaldi_compatibility_impl.py b/test/torchaudio_unittest/kaldi_compatibility_impl.py
@@ -1,54 +1,24 @@
 """Test suites for checking numerical compatibility against Kaldi"""
-import subprocess
-
-import kaldi_io
 import torch
 import torchaudio.functional as F
 import torchaudio.compliance.kaldi
 from parameterized import parameterized
 
 from torchaudio_unittest.common_utils import (
     TestBaseMixin,
+    TempDirMixin,
     load_params,
     skipIfNoExec,
     get_asset_path,
-    load_wav
+    load_wav,
+)
+from torchaudio_unittest.common_utils.kaldi_utils import (
+    convert_args,
+    run_kaldi,
 )
 
 
-def _convert_args(**kwargs):
-    args = []
-    for key, value in kwargs.items():
-        key = '--' + key.replace('_', '-')
-        value = str(value).lower() if value in [True, False] else str(value)
-        args.append('%s=%s' % (key, value))
-    return args
-
-
-def _run_kaldi(command, input_type, input_value):
-    """Run provided Kaldi command, pass a tensor and get the resulting tensor
-
-    Args:
-        input_type: str
-            'ark' or 'scp'
-        input_value:
-            Tensor for 'ark'
-            string for 'scp' (path to an audio file)
-    """
-    key = 'foo'
-    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-    if input_type == 'ark':
-        kaldi_io.write_mat(process.stdin, input_value.cpu().numpy(), key=key)
-    elif input_type == 'scp':
-        process.stdin.write(f'{key} {input_value}'.encode('utf8'))
-    else:
-        raise NotImplementedError('Unexpected type')
-    process.stdin.close()
-    result = dict(kaldi_io.read_mat_ark(process.stdout))['foo']
-    return torch.from_numpy(result.copy())  # copy supresses some torch warning
-
-
-class Kaldi(TestBaseMixin):
+class Kaldi(TempDirMixin, TestBaseMixin):
     def assert_equal(self, output, *, expected, rtol=None, atol=None):
         expected = expected.to(dtype=self.dtype, device=self.device)
         self.assertEqual(output, expected, rtol=rtol, atol=atol)
@@ -65,8 +35,8 @@ def test_sliding_window_cmn(self):
 
         tensor = torch.randn(40, 10, dtype=self.dtype, device=self.device)
         result = F.sliding_window_cmn(tensor, **kwargs)
-        command = ['apply-cmvn-sliding'] + _convert_args(**kwargs) + ['ark:-', 'ark:-']
-        kaldi_result = _run_kaldi(command, 'ark', tensor)
+        command = ['apply-cmvn-sliding'] + convert_args(**kwargs) + ['ark:-', 'ark:-']
+        kaldi_result = run_kaldi(command, 'ark', tensor)
         self.assert_equal(result, expected=kaldi_result)
 
     @parameterized.expand(load_params('kaldi_test_fbank_args.json'))
@@ -76,8 +46,8 @@ def test_fbank(self, kwargs):
         wave_file = get_asset_path('kaldi_file.wav')
         waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
         result = torchaudio.compliance.kaldi.fbank(waveform, **kwargs)
-        command = ['compute-fbank-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
-        kaldi_result = _run_kaldi(command, 'scp', wave_file)
+        command = ['compute-fbank-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
+        kaldi_result = run_kaldi(command, 'scp', wave_file)
         self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
 
     @parameterized.expand(load_params('kaldi_test_spectrogram_args.json'))
@@ -87,8 +57,8 @@ def test_spectrogram(self, kwargs):
         wave_file = get_asset_path('kaldi_file.wav')
         waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
         result = torchaudio.compliance.kaldi.spectrogram(waveform, **kwargs)
-        command = ['compute-spectrogram-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
-        kaldi_result = _run_kaldi(command, 'scp', wave_file)
+        command = ['compute-spectrogram-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
+        kaldi_result = run_kaldi(command, 'scp', wave_file)
         self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
 
     @parameterized.expand(load_params('kaldi_test_mfcc_args.json'))
@@ -98,6 +68,6 @@ def test_mfcc(self, kwargs):
         wave_file = get_asset_path('kaldi_file.wav')
         waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
         result = torchaudio.compliance.kaldi.mfcc(waveform, **kwargs)
-        command = ['compute-mfcc-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
-        kaldi_result = _run_kaldi(command, 'scp', wave_file)
+        command = ['compute-mfcc-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
+        kaldi_result = run_kaldi(command, 'scp', wave_file)
         self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
@@ -17,6 +17,14 @@ else()
 endif()
 list(APPEND TORCHAUDIO_THIRD_PARTIES libsox)
 
+################################################################################
+# kaldi
+################################################################################
+if (BUILD_KALDI)
+  add_subdirectory(kaldi)
+  list(APPEND TORCHAUDIO_THIRD_PARTIES kaldi)
+endif()
+
 ################################################################################
 # transducer
 ################################################################################

diff --git a/third_party/kaldi/CMakeLists.txt b/third_party/kaldi/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Builds the static `kaldi` library from a patched Kaldi submodule plus the
+# vector/matrix sources kept under ./src.
+set(KALDI_REPO ${CMAKE_CURRENT_SOURCE_DIR}/submodule)
+
+# Apply custom patch
+# Revert tracked changes in the submodule first so the patch is applied to a clean tree.
+# NOTE(review): none of the execute_process calls below capture a result, so a failing
+# command does not stop configuration.
+execute_process(
+  WORKING_DIRECTORY ${KALDI_REPO}
+  COMMAND "git" "checkout" "."
+  )
+# The patch path is relative to WORKING_DIRECTORY, i.e. the kaldi.patch next to this file.
+execute_process(
+  WORKING_DIRECTORY ${KALDI_REPO}
+  COMMAND git apply ../kaldi.patch
+  )
+# Update the version string
+execute_process(
+  WORKING_DIRECTORY ${KALDI_REPO}/src/base
+  COMMAND sh get_version.sh
+  )
+
+# src/matrix/* are the in-tree vector/matrix sources; submodule/src/* come from the Kaldi checkout.
+set(KALDI_SOURCES
+  src/matrix/kaldi-vector.cc
+  src/matrix/kaldi-matrix.cc
+  submodule/src/base/kaldi-error.cc
+  submodule/src/base/kaldi-math.cc
+  submodule/src/feat/feature-functions.cc
+  submodule/src/feat/pitch-functions.cc
+  submodule/src/feat/resample.cc
+  )
+
+add_library(kaldi STATIC ${KALDI_SOURCES})
+# `src` is listed before `submodule/src`; presumably so the in-tree matrix headers are
+# found first -- confirm before reordering.
+target_include_directories(kaldi PUBLIC src submodule/src)
+target_link_libraries(kaldi ${TORCH_LIBRARIES})
diff --git a/third_party/kaldi/README.md b/third_party/kaldi/README.md
@@ -0,0 +1,6 @@
+# Custom Kaldi build
+
+This directory contains the original Kaldi repository (as a submodule), [the custom implementation of Kaldi's vector/matrix](./src), and the build script.
+
+We use the custom build process so that the resulting library only contains what torchaudio needs.
+We use the custom vector/matrix implementation so that we can use the same BLAS library that PyTorch is compiled with, and so that we can (hopefully, in the future) take advantage of other PyTorch features (such as differentiability and GPU support). The downside of this approach is that it adds a lot of overhead compared to the original Kaldi (operator dispatch and element-wise processing, which PyTorch is not efficient at). We can improve this gradually, and if you are interested in helping, please let us know by opening an issue.
diff --git a/third_party/kaldi/kaldi.patch b/third_party/kaldi/kaldi.patch
@@ -0,0 +1,76 @@
+diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h
+index 7ebf4f853..c15b288b2 100644
+--- a/src/base/kaldi-types.h
++++ b/src/base/kaldi-types.h
+@@ -41,6 +41,7 @@ typedef float   BaseFloat;
+
+ // for discussion on what to do if you need compile kaldi
+ // without OpenFST, see the bottom of this this file
++/*
+ #include <fst/types.h>
+
+ namespace kaldi {
+@@ -53,10 +54,10 @@ namespace kaldi {
+   typedef float   float32;
+   typedef double double64;
+ }  // end namespace kaldi
++*/
+
+ // In a theoretical case you decide compile Kaldi without the OpenFST
+ // comment the previous namespace statement and uncomment the following
+-/*
+ namespace kaldi {
+   typedef int8_t   int8;
+   typedef int16_t  int16;
+@@ -70,6 +71,5 @@ namespace kaldi {
+   typedef float    float32;
+   typedef double   double64;
+ }  // end namespace kaldi
+-*/
+
+ #endif  // KALDI_BASE_KALDI_TYPES_H_
+diff --git a/src/matrix/matrix-lib.h b/src/matrix/matrix-lib.h
+index b6059b06c..4fb9e1b16 100644
+--- a/src/matrix/matrix-lib.h
++++ b/src/matrix/matrix-lib.h
+@@ -25,14 +25,14 @@
+ #include "base/kaldi-common.h"
+ #include "matrix/kaldi-vector.h"
+ #include "matrix/kaldi-matrix.h"
+-#include "matrix/sp-matrix.h"
+-#include "matrix/tp-matrix.h"
++// #include "matrix/sp-matrix.h"
++// #include "matrix/tp-matrix.h"
+ #include "matrix/matrix-functions.h"
+ #include "matrix/srfft.h"
+ #include "matrix/compressed-matrix.h"
+-#include "matrix/sparse-matrix.h"
++// #include "matrix/sparse-matrix.h"
+ #include "matrix/optimization.h"
+-#include "matrix/numpy-array.h"
++// #include "matrix/numpy-array.h"
+
+ #endif
+
+diff --git a/src/util/common-utils.h b/src/util/common-utils.h
+index cfb0c255c..48d199e97 100644
+--- a/src/util/common-utils.h
++++ b/src/util/common-utils.h
+@@ -21,11 +21,11 @@
+
+ #include "base/kaldi-common.h"
+ #include "util/parse-options.h"
+-#include "util/kaldi-io.h"
+-#include "util/simple-io-funcs.h"
+-#include "util/kaldi-holder.h"
+-#include "util/kaldi-table.h"
+-#include "util/table-types.h"
+-#include "util/text-utils.h"
++// #include "util/kaldi-io.h"
++// #include "util/simple-io-funcs.h"
++// #include "util/kaldi-holder.h"
++// #include "util/kaldi-table.h"
++// #include "util/table-types.h"
++// #include "util/text-utils.h"
+
+ #endif  // KALDI_UTIL_COMMON_UTILS_H_