Pascal API for VAD (#1249)

k2-fsa · Aug 13, 2024 · 619279b · 619279b
1 parent a7dc6c2
commit 619279b
Show file tree

Hide file tree

Showing 24 changed files with 1,199 additions and 14 deletions.
diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml
@@ -116,12 +116,54 @@ jobs:
             cp -v install/lib/*.dll ../pascal-api-examples/read-wav
             cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
             cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
+            cp -v install/lib/*.dll ../pascal-api-examples/vad
+            cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
 
             cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
             cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
             cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
+            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
+            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
           fi
 
+      # Runs the two example scripts in pascal-api-examples/vad-with-non-streaming-asr
+      # (Whisper first, then SenseVoice) with the FPC 3.2.2 win64 tools prepended to PATH.
+      # Anything matching sherpa-onnx-* in that directory is deleted after each script,
+      # presumably to free the space taken by the downloaded models -- confirm.
+      - name:  Run Pascal test (VAD + non-streaming ASR)
+        shell: bash
+        run: |
+          export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
+
+          cd ./pascal-api-examples
+
+          pushd vad-with-non-streaming-asr
+          time ./run-vad-with-whisper.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          time ./run-vad-with-sense-voice.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          ls -lh
+
+          popd
+
+      # Runs the two example scripts in pascal-api-examples/vad
+      # (run-circular-buffer.sh, then run-remove-silence.sh) with the
+      # FPC 3.2.2 win64 tools prepended to PATH, then lists the directory.
+      - name:  Run Pascal test (VAD test)
+        shell: bash
+        run: |
+          export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
+
+          cd ./pascal-api-examples
+
+          pushd vad
+          ./run-circular-buffer.sh
+          echo "---"
+
+          time ./run-remove-silence.sh
+          echo "---"
+
+          ls -lh
+
+          popd
+
       - name:  Run Pascal test (Read wav test)
         shell: bash
         run: |

diff --git a/pascal-api-examples/README.md b/pascal-api-examples/README.md
@@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
 |[read-wav](./read-wav)|It shows how to read a wave file.|
 |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
+|[vad](./vad)| It shows how to use the voice activity detection API.|
+|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
diff --git a/pascal-api-examples/non-streaming-asr/nemo_ctc.pas b/pascal-api-examples/non-streaming-asr/nemo_ctc.pas
@@ -33,6 +33,8 @@
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
   Config.ModelConfig.Provider := 'cpu';

diff --git a/pascal-api-examples/non-streaming-asr/nemo_transducer.pas b/pascal-api-examples/non-streaming-asr/nemo_transducer.pas
@@ -33,6 +33,8 @@
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
   Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
   Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';

diff --git a/pascal-api-examples/non-streaming-asr/paraformer.pas b/pascal-api-examples/non-streaming-asr/paraformer.pas
@@ -33,6 +33,8 @@
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
   Config.ModelConfig.Provider := 'cpu';

diff --git a/pascal-api-examples/non-streaming-asr/paraformer_itn.pas b/pascal-api-examples/non-streaming-asr/paraformer_itn.pas
@@ -33,6 +33,8 @@
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
   Config.ModelConfig.Provider := 'cpu';

diff --git a/pascal-api-examples/non-streaming-asr/sense_voice.pas b/pascal-api-examples/non-streaming-asr/sense_voice.pas
@@ -33,6 +33,8 @@
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
   Config.ModelConfig.SenseVoice.Language := 'auto';
   Config.ModelConfig.SenseVoice.UseItn := False;

diff --git a/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas b/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas
@@ -33,6 +33,8 @@
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt';
   Config.ModelConfig.Provider := 'cpu';

diff --git a/pascal-api-examples/non-streaming-asr/whisper.pas b/pascal-api-examples/non-streaming-asr/whisper.pas
@@ -33,6 +33,8 @@
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
   Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';

diff --git a/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas b/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas
@@ -33,6 +33,8 @@
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx';
   Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx';
   Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx';

diff --git a/pascal-api-examples/vad-with-non-streaming-asr/.gitignore b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore
@@ -0,0 +1,3 @@
+!run-*.sh
+vad_with_whisper
+vad_with_sense_voice
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/README.md b/pascal-api-examples/vad-with-non-streaming-asr/README.md
@@ -0,0 +1,12 @@
+# Introduction
+
+
+This directory contains examples showing how to use VAD (voice activity detection)
+together with non-streaming speech recognition models.
+
+|File| Description|
+|---------|------------|
+|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
+|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
+
+Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+if [[ ! -f ./silero_vad.onnx ]]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+fi
+
+fpc \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  ./vad_with_sense_voice.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./vad_with_sense_voice
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+if [[ ! -f ./silero_vad.onnx ]]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./Obama.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+fi
+
+if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+
+  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+  rm sherpa-onnx-whisper-tiny.en.tar.bz2
+fi
+
+fpc \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  ./vad_with_whisper.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./vad_with_whisper
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas
@@ -0,0 +1,137 @@
+{ Copyright (c)  2024  Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming SenseVoice model
+with silero VAD to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+{ The program name matches the source file, vad_with_sense_voice.pas }
+program vad_with_sense_voice;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  SysUtils;
+
+{
+  Creates a voice activity detector that uses the silero VAD model stored
+  in ./silero_vad.onnx and runs on the CPU with a single thread.
+}
+function CreateVad(): TSherpaOnnxVoiceActivityDetector;
+const
+  {Please don't change these two values unless you know the details}
+  ExpectedSampleRate = 16000;
+  SamplesPerWindow = 512;
+var
+  VadConfig: TSherpaOnnxVadModelConfig;
+begin
+  Initialize(VadConfig);
+
+  {Options that are not specific to silero VAD}
+  VadConfig.SampleRate := ExpectedSampleRate;
+  VadConfig.NumThreads := 1;
+  VadConfig.Provider := 'cpu';
+  VadConfig.Debug := True;
+
+  {Options of the silero VAD model itself}
+  VadConfig.SileroVad.Model := './silero_vad.onnx';
+  VadConfig.SileroVad.WindowSize := SamplesPerWindow;
+  VadConfig.SileroVad.Threshold := 0.5;
+  VadConfig.SileroVad.MinSilenceDuration := 0.5;
+  VadConfig.SileroVad.MinSpeechDuration := 0.5;
+
+  {NOTE(review): 30 is presumably the buffer size in seconds -- confirm
+   against the constructor declared in sherpa_onnx.pas}
+  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
+end;
+
+{
+  Creates a non-streaming recognizer that uses the int8 SenseVoice model
+  from ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17, with the
+  language set to 'auto' and inverse text normalization disabled.
+}
+function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
+var
+  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
+begin
+  Initialize(RecognizerConfig);
+
+  {Options shared by all model types}
+  RecognizerConfig.ModelConfig.NumThreads := 1;
+  RecognizerConfig.ModelConfig.Provider := 'cpu';
+  RecognizerConfig.ModelConfig.Debug := False;
+  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt';
+
+  {Options of the SenseVoice model}
+  RecognizerConfig.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
+  RecognizerConfig.ModelConfig.SenseVoice.Language := 'auto';
+  RecognizerConfig.ModelConfig.SenseVoice.UseItn := False;
+
+  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
+end;
+
+{
+  Decodes every speech segment currently queued in Vad and prints one line
+  per segment in the form "<start> -- <end> <text>".
+
+  Vad: detector whose queued segments are fetched and removed.
+  Recognizer: recognizer used to decode each segment.
+  SampleRate: sample rate of the audio that was fed to Vad; it is passed to
+    the recognizer stream and used to convert sample counts to seconds.
+}
+procedure DecodeQueuedSegments(Vad: TSherpaOnnxVoiceActivityDetector;
+  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
+var
+  SpeechSegment: TSherpaOnnxSpeechSegment;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+  Start: Single;
+  Duration: Single;
+begin
+  while not Vad.IsEmpty do
+    begin
+      SpeechSegment := Vad.Front();
+      Vad.Pop();
+
+      Stream := Recognizer.CreateStream();
+      try
+        Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
+        Recognizer.Decode(Stream);
+        RecognitionResult := Recognizer.GetResult(Stream);
+
+        {Dividing by the sample rate converts sample counts to seconds}
+        Start := SpeechSegment.Start / SampleRate;
+        Duration := Length(SpeechSegment.Samples) / SampleRate;
+        WriteLn(Format('%.3f -- %.3f %s',
+          [Start, Start + Duration, RecognitionResult.Text]));
+      finally
+        {Release the stream even if decoding raised an exception}
+        FreeAndNil(Stream);
+      end;
+    end;
+end;
+
+var
+  Wave: TSherpaOnnxWave;
+
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Vad: TSherpaOnnxVoiceActivityDetector;
+
+  Offset: Integer;
+  WindowSize: Integer;
+begin
+  {
+    Reads ./lei-jun-test.wav, feeds it to the VAD one window at a time and
+    decodes each detected speech segment with the SenseVoice recognizer.
+  }
+  Vad := nil;
+  Recognizer := nil;
+  try
+    Vad := CreateVad();
+    Recognizer := CreateOfflineRecognizer();
+
+    Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
+    if Wave.SampleRate <> Vad.Config.SampleRate then
+      {The VAD only accepts audio at its configured sample rate; the
+       finally block below still releases Vad and Recognizer}
+      WriteLn(Format('Expected sample rate: %d. Given: %d',
+        [Vad.Config.SampleRate, Wave.SampleRate]))
+    else
+      begin
+        WindowSize := Vad.Config.SileroVad.WindowSize;
+        Offset := 0;
+
+        {Only full windows are fed to the VAD; a trailing remainder shorter
+         than WindowSize is not passed to AcceptWaveform}
+        while Offset + WindowSize <= Length(Wave.Samples) do
+          begin
+            Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
+            Inc(Offset, WindowSize);
+
+            DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
+          end;
+
+        {Flush the VAD, then decode whatever segments are queued afterwards}
+        Vad.Flush;
+        DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
+      end;
+  finally
+    {FreeAndNil is safe on nil, so this also covers a failed creation}
+    FreeAndNil(Recognizer);
+    FreeAndNil(Vad);
+  end;
+end.