From c3260ef842e2b6be23460e0a51130dde644c2161 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sun, 21 Jul 2024 10:14:14 +0800
Subject: [PATCH] Add JavaScript API for SenseVoice (#1157)

---
 .github/scripts/test-nodejs-addon-npm.sh      |   7 ++
 nodejs-addon-examples/README.md               |  16 +++
 nodejs-addon-examples/package.json            |   2 +-
 .../test_asr_non_streaming_sense_voice.js     |  48 ++++++++
 ...sr_non_streaming_sense_voice_microphone.js | 111 ++++++++++++++++++
 .../node-addon-api/src/non-streaming-asr.cc   |  27 +++++
 6 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 nodejs-addon-examples/test_asr_non_streaming_sense_voice.js
 create mode 100644 nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js

diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh
index 6cced13bb..867c0f022 100755
--- a/.github/scripts/test-nodejs-addon-npm.sh
+++ b/.github/scripts/test-nodejs-addon-npm.sh
@@ -20,6 +20,13 @@ if [[ $arch != "ia32" && $platform != "win32" ]]; then
   node ./test_asr_non_streaming_nemo_ctc.js
   rm -rf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
 
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+
+  node ./test_asr_non_streaming_sense_voice.js
+  rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+
   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
   tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
   rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md
index 29e86800d..04db30825 100644
--- a/nodejs-addon-examples/README.md
+++ b/nodejs-addon-examples/README.md
@@ -95,6 +95,7 @@ The following tables list the examples in this folder.
 |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
+|[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
 
 ## Non-Streaming speech-to-text from a microphone with VAD
 
@@ -104,6 +105,7 @@ The following tables list the examples in this folder.
 |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
 |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
+|[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
 
 ## Text-to-speech
 
@@ -252,6 +254,20 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_paraformer_microphone.js
 ```
 
+### Non-streaming speech recognition with SenseVoice
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+
+node ./test_asr_non_streaming_sense_voice.js
+
+# To run VAD + non-streaming ASR with SenseVoice using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_sense_voice_microphone.js
+```
+
 ### Text-to-speech with piper VITS models (TTS)
 
 ```bash
diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json
index 8d525f142..e4bb08801 100644
--- a/nodejs-addon-examples/package.json
+++ b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
   "dependencies": {
-    "sherpa-onnx-node": "^1.0.30"
+    "sherpa-onnx-node": "^1.10.17"
   }
 }
diff --git a/nodejs-addon-examples/test_asr_non_streaming_sense_voice.js b/nodejs-addon-examples/test_asr_non_streaming_sense_voice.js
new file mode 100644
index 000000000..99371e8f3
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_non_streaming_sense_voice.js
@@ -0,0 +1,48 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'senseVoice': {
+      'model':
+          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
+      'useInverseTextNormalization': 1,
+    },
+    'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename =
+    './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';
+
+const recognizer = new sherpa_onnx.OfflineRecognizer(config);
+console.log('Started')
+let start = Date.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+recognizer.decode(stream);
+const result = recognizer.getResult(stream);
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js
new file mode 100644
index 000000000..c96cbf0af
--- /dev/null
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js
@@ -0,0 +1,111 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'senseVoice': {
+        'model':
+            './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
+        'useInverseTextNormalization': 1,
+      },
+      'tokens':
+          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // Please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+const bufferSizeInSeconds = 30;
+const buffer =
+    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected;
+                         // if set to false, just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: vad.config.sampleRate
+  }
+});
+
+let printed = false;
+let index = 0;
+ai.on('data', data => {
+  const windowSize = vad.config.sileroVad.windowSize;
+  buffer.push(new Float32Array(data.buffer));
+  while (buffer.size() > windowSize) {
+    const samples = buffer.get(buffer.head(), windowSize);
+    buffer.pop(windowSize);
+    vad.acceptWaveform(samples);
+  }
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+    const stream = recognizer.createStream();
+    stream.acceptWaveform({
+      samples: segment.samples,
+      sampleRate: recognizer.config.featConfig.sampleRate
+    });
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${index}: ${text}`);
+
+      const filename = `${index}-${text}-${
+          new Date()
+              .toLocaleTimeString('en-US', {hour12: false})
+              .split(' ')[0]}.wav`;
+      sherpa_onnx.writeWave(
+          filename,
+          {samples: segment.samples, sampleRate: vad.config.sampleRate});
+
+      index += 1;
+    }
+  }
+});
+
+ai.start();
+console.log('Started! Please speak')
diff --git a/scripts/node-addon-api/src/non-streaming-asr.cc b/scripts/node-addon-api/src/non-streaming-asr.cc
index db14ef52d..efac28984 100644
--- a/scripts/node-addon-api/src/non-streaming-asr.cc
+++ b/scripts/node-addon-api/src/non-streaming-asr.cc
@@ -96,6 +96,24 @@ static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig(
   return c;
 }
 
+static SherpaOnnxOfflineSenseVoiceModelConfig GetOfflineSenseVoiceModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineSenseVoiceModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("senseVoice") || !obj.Get("senseVoice").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("senseVoice").As<Napi::Object>();
+
+  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(language, language);
+  SHERPA_ONNX_ASSIGN_ATTR_INT32(use_itn, useInverseTextNormalization);
+
+  return c;
+}
+
 static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
   SherpaOnnxOfflineModelConfig c;
   memset(&c, 0, sizeof(c));
@@ -111,6 +129,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
   c.nemo_ctc = GetOfflineNeMoCtcModelConfig(o);
   c.whisper = GetOfflineWhisperModelConfig(o);
   c.tdnn = GetOfflineTdnnModelConfig(o);
+  c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
 
   SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
   SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -225,6 +244,14 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
     delete[] c.model_config.tdnn.model;
   }
 
+  if (c.model_config.sense_voice.model) {
+    delete[] c.model_config.sense_voice.model;
+  }
+
+  if (c.model_config.sense_voice.language) {
+    delete[] c.model_config.sense_voice.language;
+  }
+
   if (c.model_config.tokens) {
     delete[] c.model_config.tokens;
   }
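
Note: besides `useInverseTextNormalization`, the new `GetOfflineSenseVoiceModelConfig` binding also maps an optional `language` string (via `SHERPA_ONNX_ASSIGN_ATTR_STR(language, language)`), which neither example sets. The following is a minimal sketch, not part of the patch, of a config that pins the recognition language; the `'zh'` value and the empty-string-means-auto-detect behavior are assumptions inferred from the languages in the model name, not something this patch specifies.

```js
// Hypothetical sketch: a config that also sets 'language', which the
// new C++ binding copies into SherpaOnnxOfflineSenseVoiceModelConfig.language.
// Assumption: 'zh' selects Chinese; an empty string is assumed to mean
// auto-detection. Verify against the sherpa-onnx docs before relying on it.
const sherpa_onnx = require('sherpa-onnx-node');

const config = {
  'featConfig': {'sampleRate': 16000, 'featureDim': 80},
  'modelConfig': {
    'senseVoice': {
      'model':
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
      'language': 'zh',  // assumption: one of zh/en/ja/ko/yue
      'useInverseTextNormalization': 1,
    },
    'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
    'numThreads': 2,
  }
};

const recognizer = new sherpa_onnx.OfflineRecognizer(config);
```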