Skip to content

Commit

Permalink
Pascal API for VAD (#1249)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Aug 13, 2024
1 parent a7dc6c2 commit 619279b
Show file tree
Hide file tree
Showing 24 changed files with 1,199 additions and 14 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/pascal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,54 @@ jobs:
cp -v install/lib/*.dll ../pascal-api-examples/read-wav
cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
cp -v install/lib/*.dll ../pascal-api-examples/vad
cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
fi
# Runs the two VAD + non-streaming ASR example scripts (Whisper first, then
# SenseVoice) from pascal-api-examples/vad-with-non-streaming-asr, timing each one.
# The Lazarus FPC 3.2.2 win64 bin directory is prepended to PATH before the scripts run.
# After each script, everything in that directory matching sherpa-onnx-* is removed.
# NOTE(review): that glob would also match a file such as sherpa-onnx-c-api.dll if it
# was copied into this directory by an earlier step - confirm that is intended.
- name: Run Pascal test (VAD + non-streaming ASR)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd vad-with-non-streaming-asr
time ./run-vad-with-whisper.sh
rm -rf sherpa-onnx-*
echo "---"
time ./run-vad-with-sense-voice.sh
rm -rf sherpa-onnx-*
echo "---"
ls -lh
popd
# Runs the example scripts in pascal-api-examples/vad: run-circular-buffer.sh, then
# run-remove-silence.sh (only the second one is timed), and finally lists the directory.
# The Lazarus FPC 3.2.2 win64 bin directory is prepended to PATH before the scripts run.
- name: Run Pascal test (VAD test)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd vad
./run-circular-buffer.sh
echo "---"
time ./run-remove-silence.sh
echo "---"
ls -lh
popd
- name: Run Pascal test (Read wav test)
shell: bash
run: |
Expand Down
2 changes: 2 additions & 0 deletions pascal-api-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|[read-wav](./read-wav)|It shows how to read a wave file.|
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|[vad](./vad)| It shows how to use the voice activity detection API.|
|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
2 changes: 2 additions & 0 deletions pascal-api-examples/non-streaming-asr/nemo_ctc.pas
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);

Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Expand Down
2 changes: 2 additions & 0 deletions pascal-api-examples/non-streaming-asr/nemo_transducer.pas
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);

Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
Expand Down
2 changes: 2 additions & 0 deletions pascal-api-examples/non-streaming-asr/paraformer.pas
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);

Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Expand Down
2 changes: 2 additions & 0 deletions pascal-api-examples/non-streaming-asr/paraformer_itn.pas
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);

Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Expand Down
2 changes: 2 additions & 0 deletions pascal-api-examples/non-streaming-asr/sense_voice.pas
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);

Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
Config.ModelConfig.SenseVoice.Language := 'auto';
Config.ModelConfig.SenseVoice.UseItn := False;
Expand Down
2 changes: 2 additions & 0 deletions pascal-api-examples/non-streaming-asr/telespeech_ctc.pas
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);

Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Expand Down
2 changes: 2 additions & 0 deletions pascal-api-examples/non-streaming-asr/whisper.pas
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);

Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);

Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx';
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx';
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx';
Expand Down
3 changes: 3 additions & 0 deletions pascal-api-examples/vad-with-non-streaming-asr/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
!run-*.sh
vad_with_whisper
vad_with_sense_voice
12 changes: 12 additions & 0 deletions pascal-api-examples/vad-with-non-streaming-asr/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Introduction


This directory contains examples showing how to use VAD (voice activity detection)
together with non-streaming speech recognition models.

|Script|Description|
|---------|------------|
|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|

Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the silero VAD model, a test wave file and the SenseVoice model if they are
# missing, then compiles and runs ./vad_with_sense_voice.pas.
#
# Every path is derived from the location of this script, so the script
# behaves the same regardless of the directory it is invoked from.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The downloads and the Pascal source below are addressed relative to the
# script directory, so make it the working directory explicitly.
cd "$SCRIPT_DIR"

# Build only when none of the platform-specific shared libraries is installed.
# The same directory is later passed to fpc (-Fl) and to the loader path.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_sense_voice.pas

# Let the dynamic loader find the shared library on Linux and macOS.
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./vad_with_sense_voice
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the silero VAD model, a test wave file and the Whisper tiny.en model if they
# are missing, then compiles and runs ./vad_with_whisper.pas.
#
# Every path is derived from the location of this script, so the script
# behaves the same regardless of the directory it is invoked from.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The downloads and the Pascal source below are addressed relative to the
# script directory, so make it the working directory explicitly.
cd "$SCRIPT_DIR"

# Build only when none of the platform-specific shared libraries is installed.
# The same directory is later passed to fpc (-Fl) and to the loader path.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./Obama.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
fi

if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2

  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi

fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_whisper.pas

# Let the dynamic loader find the shared library on Linux and macOS.
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./vad_with_whisper
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
{ Copyright (c) 2024 Xiaomi Corporation }

{
This file shows how to use a non-streaming SenseVoice model
with silero VAD to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

{ Program name matches this example: silero VAD + SenseVoice, not Whisper. }
program vad_with_sense_voice;

{$mode objfpc}

uses
sherpa_onnx,
SysUtils;

{
  Creates the voice activity detector used by this example.

  The detector is configured with the silero model at ./silero_vad.onnx,
  a sample rate of 16000, a window of 512 samples, one thread, the cpu
  provider and debug output enabled.

  The caller is expected to free the returned object.
}
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  { Please don't change these two values unless you know the details }
  ExpectedSampleRate = 16000;
  SamplesPerWindow = 512;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  Initialize(VadConfig);

  { Settings shared by the whole VAD model config }
  VadConfig.SampleRate := ExpectedSampleRate;
  VadConfig.NumThreads := 1;
  VadConfig.Provider := 'cpu';
  VadConfig.Debug := True;

  { Settings specific to the silero VAD model }
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.WindowSize := SamplesPerWindow;
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;

  { NOTE(review): 30 is presumably the buffer size in seconds - confirm in sherpa_onnx.pas }
  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;

{
  Creates the non-streaming recognizer used by this example.

  It loads the int8 SenseVoice model and its tokens file from
  ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17, with the language set
  to 'auto', UseItn disabled, one thread, the cpu provider and debug output off.

  The caller is expected to free the returned object.
}
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Runtime settings }
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.Debug := False;

  { Model files and SenseVoice options }
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt';
  RecognizerConfig.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
  RecognizerConfig.ModelConfig.SenseVoice.Language := 'auto';
  RecognizerConfig.ModelConfig.SenseVoice.UseItn := False;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;

{
  Decodes every speech segment currently queued in the VAD.

  Each segment is removed from the VAD, decoded with a fresh offline stream and
  reported on its own line as
      start-in-seconds -- end-in-seconds recognized-text

  Vad         detector whose queue is drained until it is empty
  Recognizer  recognizer used to create and decode one stream per segment
  SampleRate  sample rate of the audio fed to the VAD; it is passed to the
              stream and used to convert sample counts into seconds
}
procedure DecodePendingSegments(Vad: TSherpaOnnxVoiceActivityDetector;
  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    Stream := Recognizer.CreateStream();
    try
      Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
      Recognizer.Decode(Stream);
      RecognitionResult := Recognizer.GetResult(Stream);

      Start := SpeechSegment.Start / SampleRate;
      Duration := Length(SpeechSegment.Samples) / SampleRate;
      WriteLn(Format('%.3f -- %.3f %s',
        [Start, Start + Duration, RecognitionResult.Text]));
    finally
      { Release the stream even if decoding raises }
      FreeAndNil(Stream);
    end;
  end;
end;

var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;
begin
  Recognizer := nil;
  Vad := CreateVad();
  try
    Recognizer := CreateOfflineRecognizer();

    Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
    if Wave.SampleRate <> Vad.Config.SampleRate then
    begin
      { Nothing is decoded on a mismatch; the finally block still frees both objects }
      WriteLn(Format('Expected sample rate: %d. Given: %d',
        [Vad.Config.SampleRate, Wave.SampleRate]));
    end
    else
    begin
      WindowSize := Vad.Config.SileroVad.WindowSize;
      Offset := 0;

      { Feed the wave one full window at a time. Samples after the last full
        window are not passed to the VAD. }
      while Offset + WindowSize <= Length(Wave.Samples) do
      begin
        Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
        Inc(Offset, WindowSize);

        DecodePendingSegments(Vad, Recognizer, Wave.SampleRate);
      end;

      { Flush the VAD, then decode whatever segments it has queued }
      Vad.Flush;
      DecodePendingSegments(Vad, Recognizer, Wave.SampleRate);
    end;
  finally
    FreeAndNil(Recognizer);
    FreeAndNil(Vad);
  end;
end.
Loading

0 comments on commit 619279b

Please sign in to comment.