Add support for the Silero VAD #7

Merged (6 commits, May 7, 2021)
44 changes: 25 additions & 19 deletions README.md
@@ -16,6 +16,8 @@ If you're using this library with Electron, you should probably use [electron-re

## Usage

This library uses two voice activity detection mechanisms: a fast first pass (the WebRTC VAD), and a slightly slower, but much more accurate, second pass (the Silero VAD). See below for the various options you can supply to each.

### Devices

You can get a list of supported devices with:
@@ -43,7 +45,7 @@ Or, just the speech with:

import { SpeechRecorder } from "speech-recorder";

const recorder = new SpeechRecorder({ sampleRate: 16000, framesPerBuffer: 320 });
const recorder = new SpeechRecorder({ framesPerBuffer: 320 });
const writeStream = fs.createWriteStream("audio.raw");

recorder.start({
@@ -54,34 +56,38 @@ Or, just the speech with:
}
});

As you can see, `onSpeech` will be called whenever speech is detected, and `onAudio` will be called regardless (i.e., on every frame).
### Options

The `SpeechRecorder` constructor supports the following options:

- `error`: callback called on audio stream error. defaults to `null`.
- `framesPerBuffer`: the number of audio frames to read at a time. defaults to `320`.
- `highWaterMark`: the `highWaterMark` to be applied to the underlying stream, or how much audio can be buffered in memory. defaults to `64000` (64kb).
- `leadingPadding`: the number of frames to buffer at the start of a speech chunk. this can prevent audio at the start of the file from getting cut off. defaults to `30`.
- `level`: the VAD aggressiveness level on a scale of 0-3, with 0 being the least aggressive and 3 being the most aggressive. defaults to `3`.
- `sampleRate`: the sample rate for the audio; must be 8000, 16000, 32000, or 48000. defaults to `16000`.
- `speakingThreshold`: the number of consecutive speaking frames before considering speech to have started.
- `silenceThreshold`: the number of consecutive non-speaking frames before considering speech to be finished.
- `triggers`: a list of `Trigger` objects that can optionally specify when the `onTrigger` callback is executed.
- `disableSecondPass`: whether or not to disable the second pass. defaults to `false`.
- `error`: callback called on audio stream error. defaults to `null`.
- `framesPerBuffer`: the number of audio frames to read at a time. defaults to `320`.
- `highWaterMark`: the `highWaterMark` to be applied to the underlying stream, or how much audio can be buffered in memory. defaults to `64000` (64kb).
- `leadingPadding`: the number of frames to buffer at the start of a speech chunk. this can prevent audio at the start of the file from getting cut off. defaults to `20`.
- `firstPassLevel`: the aggressiveness of the first-pass (WebRTC) filter on a scale of 0-3, with 0 being the least aggressive and 3 being the most aggressive. defaults to `3`.
- `minimumVolume`: a minimum volume threshold for speech.
- `speakingThreshold`: the number of consecutive speaking frames before considering speech to have started. defaults to `1`.
- `silenceThreshold`: the number of consecutive non-speaking buffers before considering speech to be finished. defaults to `10`.
- `triggers`: a list of `Trigger` objects that can optionally specify when the `onTrigger` callback is executed.
- `vadBufferSize`: the number of buffers to pass to the second-pass VAD. i.e., the number of frames passed to the VAD is `framesPerBuffer * vadBufferSize`.
- `vadThreshold`: the probability cutoff, between 0 and 1, for the second-pass VAD. defaults to `0.75`. e.g., a value of `0.9` will only consider a buffer to be speech if the VAD is 90% confident.

The `start` method supports the following options:

- `deviceId`: `id` value from `getDevices` corresponding to the device you want to use; a value of `-1` uses the default device.
- `onAudio`: a callback to be executed when audio data is received from the mic. will be passed `(audio, speaking, speech, volume, silence)`, where `audio` is the buffer of audio data, `speaking` is whether or not we're in the speaking state, `speech` is whether the current frame is speech (recall that consecutive non-speaking frames must be found to exit the speaking state, so `speaking` and `speech` can be different), `volume` is the volume of the audio, and `silence` is the number of consecutive silence frames that have been heard.
- `onChunkStart`: a callback to be executed when a speech chunk starts. will be passed the leading buffer, whose size is determined by `leadingPadding`.
- `onChunkEnd`: a callback to be executed when a speech chunk ends.
- `onTrigger`: a callback to be executed when a trigger threshold is met.
- `deviceId`: `id` value from `getDevices` corresponding to the device you want to use; a value of `-1` uses the default device.
- `onAudio`: a callback to be executed when audio data is received from the mic. will be passed `(audio, speaking, speech, volume, silence, probability)`, where `audio` is the buffer of audio data, `speaking` is whether or not we're in the speaking state, `speech` is whether the current frame is speech (recall that consecutive non-speaking frames must be found to exit the speaking state, so `speaking` and `speech` can be different), `volume` is the volume of the audio, `silence` is the number of consecutive silence frames that have been heard, and `probability` is the second-pass VAD's speech probability for the current buffer.
- `onChunkStart`: a callback to be executed when a speech chunk starts. will be passed the leading buffer, whose size is determined by `leadingPadding`.
- `onChunkEnd`: a callback to be executed when a speech chunk ends.
- `onTrigger`: a callback to be executed when a trigger threshold is met.
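To make the threshold options concrete, here's a minimal sketch of how `speakingThreshold` and `silenceThreshold` interact to decide when a speech chunk starts and ends. This is plain JavaScript, not the library's internal code, and `segmentSpeech` is a hypothetical helper name:

```javascript
// Sketch of the consecutive-frame logic described above. The library applies
// this per audio buffer; here we feed in a precomputed array of per-frame
// speech booleans (as a first-pass VAD would produce).
function segmentSpeech(frames, speakingThreshold = 1, silenceThreshold = 10) {
  let speaking = false;
  let consecutiveSpeech = 0;
  let consecutiveSilence = 0;
  const events = [];

  frames.forEach((isSpeech, i) => {
    if (isSpeech) {
      consecutiveSilence = 0;
      consecutiveSpeech++;
    } else {
      consecutiveSpeech = 0;
      consecutiveSilence++;
    }

    if (!speaking && consecutiveSpeech >= speakingThreshold) {
      speaking = true;
      events.push(["start", i]);
    } else if (speaking && consecutiveSilence >= silenceThreshold) {
      speaking = false;
      events.push(["end", i]);
    }
  });

  return events;
}

// One speech frame starts a chunk (speakingThreshold = 1); three
// consecutive silent frames end it (silenceThreshold = 3).
console.log(segmentSpeech([false, true, true, false, false, false], 1, 3));
// [["start", 1], ["end", 5]]
```

Larger thresholds trade responsiveness for robustness: a higher `speakingThreshold` suppresses one-frame false positives, while a higher `silenceThreshold` keeps a chunk alive across short pauses.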

### Examples

See the `examples/` directory for example usages.

## Credits

- speech-recorder uses [PortAudio](http://portaudio.com/) for native microphone access.
- speech-recorder uses [webrtcvad](https://github.com/serenadeai/webrtcvad) for detecting voice.
- speech-recorder is based on [node-portaudio](https://github.com/auroraapi/node-portaudio), which in turn is based on [naudiodon](https://github.com/Streampunk/naudiodon).
- speech-recorder uses [PortAudio](http://portaudio.com/) for native microphone access.
- speech-recorder uses [webrtcvad](https://github.com/serenadeai/webrtcvad) as a first-pass filter for voice detection.
- speech-recorder uses [silero-vad](https://github.com/snakers4/silero-vad) for detecting voice.
- speech-recorder is based on [node-portaudio](https://github.com/auroraapi/node-portaudio), which in turn is based on [naudiodon](https://github.com/Streampunk/naudiodon).
9 changes: 9 additions & 0 deletions examples/live.js
@@ -0,0 +1,9 @@
const { SpeechRecorder } = require("../dist/index");

console.log("Recording...");
const recorder = new SpeechRecorder();
recorder.start({
  onAudio: (audio, speech, speaking, volume, silence, probability) => {
    console.log(Date.now(), speech, probability);
  },
});
10 changes: 8 additions & 2 deletions examples/speech.js
@@ -5,10 +5,16 @@ console.log("Recording...");
const recorder = new SpeechRecorder();
const writeStream = fs.createWriteStream("audio.raw");
recorder.start({
  onSpeech: (audio, speech) => {
    console.log(new Date(), audio.length, state);
  onChunkStart: (leadingBuffer) => {
    writeStream.write(leadingBuffer);
  },
  onAudio: (audio, speech) => {
    if (speech) {
      writeStream.write(audio);
    }
  },
  onChunkEnd: () => {
    writeStream.end();
    process.exit(0);
  },
});
62 changes: 46 additions & 16 deletions index.ts
@@ -1,8 +1,9 @@
import bindings from "bindings";
import * as os from "os";
import { Readable } from "stream";
import VAD from "webrtcvad";
import WebrtcVad from "webrtcvad";
import uuid from "uuid/v4";
import SileroVad from "./vad";
const portAudioBindings = bindings("portaudio.node");

export type Trigger = {
@@ -48,23 +49,31 @@ class AudioStream extends Readable {
export class SpeechRecorder {
  private audioStarted = false;
  private audioStream?: AudioStream;
  private chunk: string = "";
  private consecutiveSpeech: number = 0;
  private consecutiveSilence: number = 0;
  private disableSecondPass: boolean = false;
  private error: null | ((e: any) => void) = null;
  private framesPerBuffer: number = 320;
  private highWaterMark: number = 64000;
  private leadingBuffer: Buffer[] = [];
  private leadingPadding: number = 30;
  private minimumVolume: number = 250;
  private leadingPadding: number = 20;
  private minimumVolume: number = 200;
  private sampleRate: number = 16000;
  private speaking: boolean = false;
  private speakingThreshold: number = 5;
  private silenceThreshold: number = 30;
  private speakingThreshold: number = 1;
  private silenceThreshold: number = 10;
  private triggers: Trigger[] = [];
  private vad: VAD;
  private webrtcVad: WebrtcVad;
  private vad = new SileroVad();
  private vadBuffer: number[][] = [];
  private vadBufferSize: number = 10;
  private vadThreshold: number = 0.75;

  constructor(options: any = {}) {
    if (options.disableSecondPass !== undefined) {
      this.disableSecondPass = options.disableSecondPass;
    }

    if (options.error) {
      this.error = options.error;
    }
@@ -85,10 +94,6 @@
      this.minimumVolume = options.minimumVolume;
    }

    if (options.sampleRate !== undefined) {
      this.sampleRate = options.sampleRate;
    }

    if (options.silenceThreshold !== undefined) {
      this.silenceThreshold = options.silenceThreshold;
    }
@@ -101,18 +106,42 @@
      this.triggers = options.triggers;
    }

    this.vad = new VAD(this.sampleRate, options.level || 3);
    if (options.vadBufferSize !== undefined) {
      this.vadBufferSize = options.vadBufferSize;
    }

    if (options.vadThreshold !== undefined) {
      this.vadThreshold = options.vadThreshold;
    }

    this.webrtcVad = new WebrtcVad(this.sampleRate, options.firstPassLevel || 3);
  }

onData(startOptions: any, audio: any) {
  async onData(startOptions: any, audio: any) {
    let sum = 0;
    let normalized: number[] = [];
    for (let i = 0; i < audio.length; i += 2) {
      sum += Math.pow(audio.readInt16LE(i), 2);
      const e = audio.readInt16LE(i);
      sum += Math.pow(e, 2);
      normalized.push(e / 32767);
    }

    this.vadBuffer.push(normalized);
    while (this.vadBuffer.length > this.vadBufferSize) {
      this.vadBuffer.shift();
    }

    // require a minimum (very low) volume threshold as well as a positive VAD result
    const volume = Math.floor(Math.sqrt(sum / (audio.length / 2)));
    const speaking = !!(this.vad.process(audio) && volume > this.minimumVolume);
    let speaking = !!(this.webrtcVad.process(audio) && volume > this.minimumVolume);
    let probability = speaking ? 1 : 0;

    // double-check the WebRTC VAD with the Silero VAD
    if (!this.disableSecondPass && speaking && this.vadBuffer.length == this.vadBufferSize) {
      probability = await this.vad.process([].concat(...this.vadBuffer));
      speaking = probability > this.vadThreshold;
    }

    if (speaking) {
      this.consecutiveSilence = 0;
      this.consecutiveSpeech++;
@@ -152,7 +181,8 @@
        this.speaking,
        speaking,
        volume,
        this.audioStarted ? this.consecutiveSilence : 0
        this.audioStarted ? this.consecutiveSilence : 0,
        probability
      );
    }

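The volume and normalization math in `onData` above is self-contained and can be exercised without the native audio bindings. A minimal sketch in plain Node.js (`analyzeBuffer` is a hypothetical helper name, not part of the library):

```javascript
// Sketch of the per-buffer math from onData: 16-bit little-endian PCM
// samples are squared and summed for an RMS volume, and each sample is
// scaled to [-1, 1] for the second-pass (Silero) VAD.
function analyzeBuffer(audio) {
  let sum = 0;
  const normalized = [];
  for (let i = 0; i < audio.length; i += 2) {
    const e = audio.readInt16LE(i);
    sum += Math.pow(e, 2);
    normalized.push(e / 32767);
  }
  // audio.length / 2 is the sample count (2 bytes per sample).
  const volume = Math.floor(Math.sqrt(sum / (audio.length / 2)));
  return { volume, normalized };
}

// Four samples, all with amplitude 1000: the RMS volume is exactly 1000.
const audio = Buffer.alloc(8);
for (let i = 0; i < 4; i++) audio.writeInt16LE(1000, i * 2);
console.log(analyzeBuffer(audio).volume); // 1000
```

With the default `minimumVolume`, a buffer this quiet would still pass the volume gate only if the first-pass VAD also flags it as speech.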
Binary file added model/silero.onnx
Binary file not shown.
14 changes: 8 additions & 6 deletions package.json
@@ -7,31 +7,33 @@
  "main": "./dist/index.js",
  "types": "./dist/index.d.ts",
  "gypfile": true,
  "keywords": ["portaudio", "audio", "record", "pcm"],
  "keywords": [
    "portaudio",
    "audio",
    "record",
    "pcm"
  ],
  "bugs": {
    "url": "https://github.com/serenadeai/speech-recorder/issues"
  },
  "repository": "git+https://github.com/serenadeai/speech-recorder.git",
  "dependencies": {
    "bindings": "^1.3.0",
    "nan": "^2.14.1",
    "onnxruntime": "^1.7.0",
    "uuid": "^3.3.3",
    "webrtcvad": "^1.0.1"
  },
  "scripts": {
    "build": "tsc",
    "clean": "rm -rf build dist bin 9* 10* 11* *.raw",
    "prepare": "tsc",
    "install": "node-gyp rebuild",
    "test": "tsc ; mocha"
    "install": "node-gyp rebuild"
  },
  "devDependencies": {
    "@types/bindings": "^1.3.0",
    "@types/meyda": "^4.3.0",
    "@types/mocha": "^7.0.1",
    "@types/node": "^12.11.5",
    "@types/uuid": "^3.4.6",
    "mocha": "^7.0.1",
    "typescript": "^3.6.4"
  }
}
90 changes: 0 additions & 90 deletions test/test.js

This file was deleted.

24 changes: 24 additions & 0 deletions vad.ts
@@ -0,0 +1,24 @@
const ort = require("onnxruntime");

export default class SileroVad {
  private loaded = false;
  private session: any;

  private async load() {
    if (this.loaded) {
      return;
    }

    this.session = await ort.InferenceSession.create(`${__dirname}/../model/silero.onnx`);
    this.loaded = true;
  }

  async process(audio: number[], batchSize = 1): Promise<number> {
    await this.load();
    const result = await this.session.run({
      input: new ort.Tensor(Float32Array.from(audio), [batchSize, audio.length / batchSize]),
    });

    return result.output.data[1];
  }
}
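`process` takes a flat `number[]`, while the recorder accumulates one normalized array per audio buffer in its `vadBuffer` window and flattens the window with `[].concat(...)` before inference. That sliding-window-and-flatten step can be illustrated on its own (`pushBuffer` is a hypothetical name for this sketch):

```javascript
// Sketch of the sliding vadBuffer window: keep the most recent
// vadBufferSize per-buffer arrays, then flatten them into one array
// of samples for the second-pass VAD.
const vadBuffer = [];
const vadBufferSize = 3;

function pushBuffer(samples) {
  vadBuffer.push(samples);
  while (vadBuffer.length > vadBufferSize) {
    vadBuffer.shift(); // drop the oldest buffer
  }
}

pushBuffer([0.1, 0.2]);
pushBuffer([0.3, 0.4]);
pushBuffer([0.5, 0.6]);
pushBuffer([0.7, 0.8]); // evicts [0.1, 0.2]

const flat = [].concat(...vadBuffer);
console.log(flat); // [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
```

With the defaults (`framesPerBuffer = 320`, `vadBufferSize = 10`), the flattened window is 3200 samples, i.e. 200ms of 16kHz audio per second-pass inference.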