Add VAD + Non-streaming ASR example for JavaScript API. (#1170)

k2-fsa · Jul 26, 2024 · 994c3e7 · 994c3e7
1 parent 299f1a8
commit 994c3e7
Show file tree

Hide file tree

Showing 22 changed files with 189 additions and 32 deletions.
diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh
@@ -10,6 +10,19 @@ arch=$(node -p "require('os').arch()")
 platform=$(node -p "require('os').platform()")
 node_version=$(node -p "process.versions.node.split('.')[0]")
 
+echo "----------non-streaming asr + vad----------"
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+rm sherpa-onnx-whisper-tiny.en.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test_vad_with_non_streaming_asr_whisper.js
+rm -rf sherpa-onnx-whisper*
+rm *.wav
+rm *.onnx
+
 echo "----------asr----------"
 
 if [[ $arch != "ia32" && $platform != "win32" ]]; then

diff --git a/.gitignore b/.gitignore
@@ -112,3 +112,4 @@ sherpa-onnx-telespeech-ctc-*
 .ccache
 lib*.a
 sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+*.bak
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 1.10.18
+
+* Fix the case when recognition results contain the symbol `"`. It caused
+  issues when converting results to a JSON string.
+
 ## 1.10.17
 
 * Support SenseVoice CTC models.

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -11,7 +11,7 @@ project(sherpa-onnx)
 # ./nodejs-addon-examples
 # ./dart-api-examples/
 # ./CHANGELOG.md
-set(SHERPA_ONNX_VERSION "1.10.17")
+set(SHERPA_ONNX_VERSION "1.10.18")
 
 # Disable warning about
 #

diff --git a/dart-api-examples/keyword-spotter/pubspec.yaml b/dart-api-examples/keyword-spotter/pubspec.yaml
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-  sherpa_onnx: ^1.10.17
+  sherpa_onnx: ^1.10.18
   # sherpa_onnx:
   #   path: ../../flutter/sherpa_onnx
   path: ^1.9.0

diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml
@@ -10,7 +10,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.17
+  sherpa_onnx: ^1.10.18
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml
@@ -11,7 +11,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.17
+  sherpa_onnx: ^1.10.18
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml
@@ -8,7 +8,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.17
+  sherpa_onnx: ^1.10.18
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-  sherpa_onnx: ^1.10.17
+  sherpa_onnx: ^1.10.18
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml
@@ -5,7 +5,7 @@ description: >
 
 publish_to: 'none'
 
-version: 1.10.17
+version: 1.10.18
 
 topics:
   - speech-recognition
@@ -30,7 +30,7 @@ dependencies:
   record: ^5.1.0
   url_launcher: ^6.2.6
 
-  sherpa_onnx: ^1.10.17
+  sherpa_onnx: ^1.10.18
   # sherpa_onnx:
     # path: ../../flutter/sherpa_onnx
 

diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml
@@ -5,7 +5,7 @@ description: >
 
 publish_to: 'none' # Remove this line if you wish to publish to pub.dev
 
-version: 1.10.17
+version: 1.10.18
 
 environment:
   sdk: '>=3.4.0 <4.0.0'
@@ -17,7 +17,7 @@ dependencies:
   cupertino_icons: ^1.0.6
   path_provider: ^2.1.3
   path: ^1.9.0
-  sherpa_onnx: ^1.10.17
+  sherpa_onnx: ^1.10.18
   url_launcher: ^6.2.6
   audioplayers: ^5.0.0
 

diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
-version: 1.10.17
+version: 1.10.18
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 
@@ -30,23 +30,23 @@ dependencies:
   flutter:
     sdk: flutter
 
-  sherpa_onnx_android: ^1.10.17
+  sherpa_onnx_android: ^1.10.18
   # sherpa_onnx_android:
   #   path: ../sherpa_onnx_android
 
-  sherpa_onnx_macos: ^1.10.17
+  sherpa_onnx_macos: ^1.10.18
   # sherpa_onnx_macos:
   #   path: ../sherpa_onnx_macos
 
-  sherpa_onnx_linux: ^1.10.17
+  sherpa_onnx_linux: ^1.10.18
   # sherpa_onnx_linux:
   #   path: ../sherpa_onnx_linux
     #
-  sherpa_onnx_windows: ^1.10.17
+  sherpa_onnx_windows: ^1.10.18
   # sherpa_onnx_windows:
   #   path: ../sherpa_onnx_windows
 
-  sherpa_onnx_ios: ^1.10.17
+  sherpa_onnx_ios: ^1.10.18
   # sherpa_onnx_ios:
   #   path: ../sherpa_onnx_ios
 

diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
@@ -7,7 +7,7 @@
 # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_ios'
-  s.version          = '1.10.17'
+  s.version          = '1.10.18'
   s.summary          = 'A new Flutter FFI plugin project.'
   s.description      = <<-DESC
 A new Flutter FFI plugin project.

diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
@@ -4,7 +4,7 @@
 #
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_macos'
-  s.version          = '1.10.17'
+  s.version          = '1.10.18'
   s.summary          = 'sherpa-onnx Flutter FFI plugin project.'
   s.description      = <<-DESC
 sherpa-onnx Flutter FFI plugin project.

diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md
@@ -93,6 +93,7 @@ The following tables list the examples in this folder.
 |---|---|
 |[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model|
 |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
+|[./test_vad_with_non_streaming_asr_whisper.js](./test_vad_with_non_streaming_asr_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper) + [Silero VAD](https://github.com/snakers4/silero-vad)|
 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
 |[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
@@ -221,11 +222,24 @@ rm sherpa-onnx-whisper-tiny.en.tar.bz2
 
 node ./test_asr_non_streaming_whisper.js
 
-# To run VAD + non-streaming ASR with Paraformer using a microphone
+# To run VAD + non-streaming ASR with Whisper using a microphone
 npm install naudiodon2
 node ./test_vad_asr_non_streaming_whisper_microphone.js
 ```
 
+### Non-streaming speech recognition with Whisper + VAD
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+rm sherpa-onnx-whisper-tiny.en.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test_vad_with_non_streaming_asr_whisper.js
+```
+
 ### Non-streaming speech recognition with NeMo CTC models
 
 ```bash

diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
   "dependencies": {
-    "sherpa-onnx-node": "^1.10.17"
+    "sherpa-onnx-node": "^1.10.18"
   }
 }
diff --git a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
@@ -0,0 +1,127 @@
+// Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Offline (non-streaming) recognizer backed by the Whisper tiny.en int8
+  // model. Please download the model files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const featConfig = {sampleRate: 16000, featureDim: 80};
+
+  const whisper = {
+    encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
+    decoder: './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
+  };
+
+  const modelConfig = {
+    whisper,
+    tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
+    numThreads: 2,
+    provider: 'cpu',
+    debug: 1,
+  };
+
+  // Hand both sections to the recognizer as a single config object.
+  return new sherpa_onnx.OfflineRecognizer({featConfig, modelConfig});
+}
+
+function createVad() {
+  // Voice activity detector using Silero VAD. Please download the model from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const sileroVad = {
+    model: './silero_vad.onnx',
+    threshold: 0.5,
+    minSpeechDuration: 0.25,
+    minSilenceDuration: 0.5,
+    windowSize: 512,
+  };
+  const vadConfig = {
+    sileroVad,
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSeconds = 60;  // passed as the Vad buffer size, in seconds
+  return new sherpa_onnx.Vad(vadConfig, bufferSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+// Please download ./Obama.wav from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const waveFilename = './Obama.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+
+// Refuse to run when the file's sample rate differs from the one the
+// recognizer was configured with. The message must be a template literal
+// (backticks); with plain quotes the ${...} placeholders are printed verbatim.
+const expectedSampleRate = recognizer.config.featConfig.sampleRate;
+if (wave.sampleRate !== expectedSampleRate) {
+  throw new Error(
+      `Expected sample rate: ${expectedSampleRate}. Given: ${wave.sampleRate}`);
+}
+
+/**
+ * Drains the VAD queue: decodes every pending speech segment and prints each
+ * non-empty transcript, lower-cased and trimmed, with its start and end time
+ * in seconds.
+ *
+ * Shared by the feeding loop and the post-flush pass so both handle segments
+ * identically.
+ */
+function decodeQueuedSegments() {
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+
+    // segment.start and the sample count are divided by the sample rate to
+    // express the segment boundaries in seconds.
+    const startTime = segment.start / wave.sampleRate;
+    const endTime = startTime + segment.samples.length / wave.sampleRate;
+
+    // Each segment gets its own stream.
+    const stream = recognizer.createStream();
+    stream.acceptWaveform(
+        {samples: segment.samples, sampleRate: wave.sampleRate});
+
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${startTime.toFixed(2)} -- ${endTime.toFixed(2)}: ${text}`);
+    }
+  }
+}
+
+console.log('Started');
+const start = Date.now();
+
+// Feed the audio to the VAD one window at a time and decode segments as soon
+// as they become available.
+const windowSize = vad.config.sileroVad.windowSize;
+for (let i = 0; i < wave.samples.length; i += windowSize) {
+  const thisWindow = wave.samples.subarray(i, i + windowSize);
+  vad.acceptWaveform(thisWindow);
+  decodeQueuedSegments();
+}
+
+// After the last window, flush the VAD and decode whatever it still yields
+// (presumably speech that was still in progress at the end of the file).
+vad.flush();
+decodeQueuedSegments();
+
+const stop = Date.now();
+console.log('Done');
+
+// Real-time factor (RTF): wall-clock processing time divided by the
+// duration of the audio.
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds');
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3));
diff --git a/scripts/dart/kws-pubspec.yaml b/scripts/dart/kws-pubspec.yaml
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-  # sherpa_onnx: ^1.10.17
+  # sherpa_onnx: ^1.10.18
   sherpa_onnx:
     path: ../../flutter/sherpa_onnx
   path: ^1.9.0

diff --git a/scripts/dart/sherpa-onnx-pubspec.yaml b/scripts/dart/sherpa-onnx-pubspec.yaml
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec
-version: 1.10.17
+version: 1.10.18
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 

diff --git a/scripts/node-addon-api/lib/vad.js b/scripts/node-addon-api/lib/vad.js
@@ -65,7 +65,7 @@ config = {
   }
 
   clear() {
-    addon.VoiceActivityDetectorClearWrapper(this.handle);
+    addon.voiceActivityDetectorClear(this.handle);
   }
 
   /*
@@ -79,11 +79,11 @@ config = {
   }
 
   reset() {
-    addon.VoiceActivityDetectorResetWrapper(this.handle);
+    addon.voiceActivityDetectorReset(this.handle);
   }
 
   flush() {
-    addon.VoiceActivityDetectorFlushWrapper(this.handle);
+    addon.voiceActivityDetectorFlush(this.handle);
   }
 }