From c3260ef842e2b6be23460e0a51130dde644c2161 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sun, 21 Jul 2024 10:14:14 +0800
Subject: [PATCH] Add JavaScript API for SenseVoice (#1157)

---
 .github/scripts/test-nodejs-addon-npm.sh      |   7 ++
 nodejs-addon-examples/README.md               |  16 +++
 nodejs-addon-examples/package.json            |   2 +-
 .../test_asr_non_streaming_sense_voice.js     |  48 ++++++++
 ...sr_non_streaming_sense_voice_microphone.js | 111 ++++++++++++++++++
 .../node-addon-api/src/non-streaming-asr.cc   |  27 +++++
 6 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 nodejs-addon-examples/test_asr_non_streaming_sense_voice.js
 create mode 100644 nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js

diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh
index 6cced13bb..867c0f022 100755
--- a/.github/scripts/test-nodejs-addon-npm.sh
+++ b/.github/scripts/test-nodejs-addon-npm.sh
@@ -20,6 +20,13 @@ if [[ $arch != "ia32" && $platform != "win32" ]]; then
   node ./test_asr_non_streaming_nemo_ctc.js
   rm -rf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
 
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+
+  node ./test_asr_non_streaming_sense_voice.js
+  rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+
   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
   tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
   rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md
index 29e86800d..04db30825 100644
--- a/nodejs-addon-examples/README.md
+++ b/nodejs-addon-examples/README.md
@@ -95,6 +95,7 @@ The following tables list the examples in this folder.
 |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
+|[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
 
 ## Non-Streaming speech-to-text from a microphone with VAD
 
@@ -104,6 +105,7 @@ The following tables list the examples in this folder.
 |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
 |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
+|[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
 
 ## Text-to-speech
 
@@ -252,6 +254,20 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_paraformer_microphone.js
 ```
 
+### Non-streaming speech recognition with SenseVoice
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+
+node ./test_asr_non_streaming_sense_voice.js
+
+# To run VAD + non-streaming ASR with SenseVoice using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_sense_voice_microphone.js
+```
+
 ### Text-to-speech with piper VITS models (TTS)
 
 ```bash
diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json
index 8d525f142..e4bb08801 100644
--- a/nodejs-addon-examples/package.json
+++ b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
   "dependencies": {
-    "sherpa-onnx-node": "^1.0.30"
+    "sherpa-onnx-node": "^1.10.17"
   }
 }
diff --git a/nodejs-addon-examples/test_asr_non_streaming_sense_voice.js b/nodejs-addon-examples/test_asr_non_streaming_sense_voice.js
new file mode 100644
index 000000000..99371e8f3
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_non_streaming_sense_voice.js
@@ -0,0 +1,48 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'senseVoice': {
+      'model':
+          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
+      'useInverseTextNormalization': 1,
+    },
+    'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename =
+    './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';
+
+const recognizer = new sherpa_onnx.OfflineRecognizer(config);
+console.log('Started')
+let start = Date.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+recognizer.decode(stream);
+const result = recognizer.getResult(stream);
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js
new file mode 100644
index 000000000..c96cbf0af
--- /dev/null
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_sense_voice_microphone.js
@@ -0,0 +1,111 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'senseVoice': {
+        'model':
+            './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
+        'useInverseTextNormalization': 1,
+      },
+      'tokens':
+          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // Please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+const bufferSizeInSeconds = 30;
+const buffer =
+    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected;
+                         // if set to false, just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: vad.config.sampleRate
+  }
+});
+
+let printed = false;
+let index = 0;
+ai.on('data', data => {
+  const windowSize = vad.config.sileroVad.windowSize;
+  buffer.push(new Float32Array(data.buffer));
+  while (buffer.size() > windowSize) {
+    const samples = buffer.get(buffer.head(), windowSize);
+    buffer.pop(windowSize);
+    vad.acceptWaveform(samples);
+  }
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+    const stream = recognizer.createStream();
+    stream.acceptWaveform({
+      samples: segment.samples,
+      sampleRate: recognizer.config.featConfig.sampleRate
+    });
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${index}: ${text}`);
+
+      const filename = `${index}-${text}-${
+          new Date()
+              .toLocaleTimeString('en-US', {hour12: false})
+              .split(' ')[0]}.wav`;
+      sherpa_onnx.writeWave(
+          filename,
+          {samples: segment.samples, sampleRate: vad.config.sampleRate});
+
+      index += 1;
+    }
+  }
+});
+
+ai.start();
+console.log('Started! Please speak')
diff --git a/scripts/node-addon-api/src/non-streaming-asr.cc b/scripts/node-addon-api/src/non-streaming-asr.cc
index db14ef52d..efac28984 100644
--- a/scripts/node-addon-api/src/non-streaming-asr.cc
+++ b/scripts/node-addon-api/src/non-streaming-asr.cc
@@ -96,6 +96,24 @@ static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig(
   return c;
 }
 
+static SherpaOnnxOfflineSenseVoiceModelConfig GetOfflineSenseVoiceModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineSenseVoiceModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("senseVoice") || !obj.Get("senseVoice").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("senseVoice").As<Napi::Object>();
+
+  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(language, language);
+  SHERPA_ONNX_ASSIGN_ATTR_INT32(use_itn, useInverseTextNormalization);
+
+  return c;
+}
+
 static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
   SherpaOnnxOfflineModelConfig c;
   memset(&c, 0, sizeof(c));
@@ -111,6 +129,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
   c.nemo_ctc = GetOfflineNeMoCtcModelConfig(o);
   c.whisper = GetOfflineWhisperModelConfig(o);
   c.tdnn = GetOfflineTdnnModelConfig(o);
+  c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
 
   SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
   SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -225,6 +244,14 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
     delete[] c.model_config.tdnn.model;
   }
 
+  if (c.model_config.sense_voice.model) {
+    delete[] c.model_config.sense_voice.model;
+  }
+
+  if (c.model_config.sense_voice.language) {
+    delete[] c.model_config.sense_voice.language;
+  }
+
   if (c.model_config.tokens) {
     delete[] c.model_config.tokens;
   }
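
Note: besides `useInverseTextNormalization`, the new `GetOfflineSenseVoiceModelConfig` binding also maps an optional `language` string (via `SHERPA_ONNX_ASSIGN_ATTR_STR(language, language)`), which neither example sets. The following is a minimal sketch, not part of the patch, of a config that pins the recognition language; the `'zh'` value and the empty-string-means-auto-detect behavior are assumptions inferred from the languages in the model name, not something this patch specifies.

```js
// Hypothetical sketch: a config that also sets 'language', which the
// new C++ binding copies into SherpaOnnxOfflineSenseVoiceModelConfig.language.
// Assumption: 'zh' selects Chinese; an empty string is assumed to mean
// auto-detection. Verify against the sherpa-onnx docs before relying on it.
const sherpa_onnx = require('sherpa-onnx-node');

const config = {
  'featConfig': {'sampleRate': 16000, 'featureDim': 80},
  'modelConfig': {
    'senseVoice': {
      'model':
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
      'language': 'zh',  // assumption: one of zh/en/ja/ko/yue
      'useInverseTextNormalization': 1,
    },
    'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
    'numThreads': 2,
  }
};

const recognizer = new sherpa_onnx.OfflineRecognizer(config);
```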