add audio_vad on macos

litongjava · Nov 25, 2023 · 089924c · 089924c
1 parent d7a3292
commit 089924c
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 26 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -21,16 +21,10 @@ include_directories(${SDL2_INCLUDE_DIRS})
 
 find_package(SampleRate CONFIG REQUIRED)
 find_package(FFMPEG REQUIRED)
-# webrtc
-include_directories(webrtc)
-include_directories(.)
-# find cpp files
-file(GLOB VAD_FILES simplevad/*.c simplevad/*.h
-        webrtc/common_audio/*/*.c webrtc/rtc_base/*.c*)
-
-add_executable(audio_vad examples/audio_vad.cpp ${VAD_FILES})
-target_link_libraries(audio_vad pthread)
-
+# Locate the SpeexDSP library; the result is stored in SPEEXDSP_LIBRARY
+find_library(SPEEXDSP_LIBRARY NAMES speexdsp)  # NOTE(review): no REQUIRED / not-found check, so a missing library yields SPEEXDSP_LIBRARY-NOTFOUND
+# Locate the directory that contains speex/speex_preprocess.h
+find_path(SPEEXDSP_INCLUDE_DIRS "speex/speex_preprocess.h")  # NOTE(review): likewise unchecked when the header is not found
 
 # Detecting Operating Systems
 if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@@ -48,12 +42,17 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
     link_directories(E:\\code\\cpp\\project-ping\\whisper.cpp\\cmake-build-release\\bin)
 endif ()
 
+add_executable(audio_vad examples/audio_vad.cpp common/common.cpp  # example program built from examples/audio_vad.cpp plus shared sources
+        stream/stream_components_service.cpp common/utils.cpp)
+target_link_libraries(audio_vad PRIVATE whisper SampleRate::samplerate ${SPEEXDSP_LIBRARY})  # SPEEXDSP_LIBRARY comes from the find_library() call earlier in this file
+# Add the SpeexDSP header directory to the include path of audio_vad
+target_include_directories(audio_vad PRIVATE ${SPEEXDSP_INCLUDE_DIRS})
 
 add_executable(sdl_version examples/sdl_version.cpp)
-target_link_libraries(sdl_version ${SDL2_LIBRARIES})
+target_link_libraries(sdl_version PRIVATE ${SDL2_LIBRARIES})  # keyword signature: PRIVATE scope made explicit
 
 add_executable(simplest examples/simplest.cpp common/common.cpp common/utils.cpp)
-target_link_libraries(simplest whisper SampleRate::samplerate)
+target_link_libraries(simplest PRIVATE whisper SampleRate::samplerate)  # keyword signature: PRIVATE scope made explicit
 
 add_executable(stream_local examples/stream_local.cpp common/common.cpp common/common-sdl.cpp common/utils.cpp
         stream/stream_components_service.cpp stream/stream_components_audio.cpp

diff --git a/common/common.cpp b/common/common.cpp
@@ -10,7 +10,7 @@
 
 #define DR_MP3_IMPLEMENTATION
 
-#include "dr_libs/dr_mp3.h"
+#include "../dr_libs/dr_mp3.h"
 #include <samplerate.h>
 #include <cmath>
 #include <cstring>

diff --git a/examples/audio_vad.cpp b/examples/audio_vad.cpp
@@ -1,15 +1,68 @@
-#include <cstdio>
-#include <cstdlib>
 #include <iostream>
-#include <memory>
-#include <filesystem>
-
-int main(int argc, char **argv) {
-//default cmake-build-debug/main
-  const char filename[] = "../pcm/16k_1.pcm";
-  const char output_dir[] = "output_pcm";
-  const char output_filename_prefix[] = "16k_1.pcm";
-  if (!std::filesystem::exists(output_dir)) {
-    std::filesystem::create_directories(output_dir);
+#include <vector>
+#include <cstdint>
+#include <whisper.h>
+
+#include "../stream/stream_components_service.h"
+#include "../stream/stream_components.h"
+#include "../common/utils.h"
+#include "../common/common.h"
+#include <speex/speex_preprocess.h>
+
+using namespace stream_components;
+
+
+int main() {  // Demo: run SpeexDSP VAD over a WAV file frame by frame and transcribe once a speech segment ends
+  std::string wav_file_path = "../samples/jfk.wav";  // replace with the path to your WAV file
+  // audio sample buffers passed to read_wav
+  std::vector<float> pcmf32;               // mono-channel F32 PCM
+  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+  ::read_wav(wav_file_path, pcmf32, pcmf32s, false);  // NOTE(review): return value is ignored — presumably a failed read leaves pcmf32 empty; confirm
+
+  printf("size of samples %lu\n", pcmf32.size());  // NOTE(review): size() returns size_t; %zu is the portable format specifier
+
+
+  whisper_local_stream_params params;
+  struct whisper_context_params cparams{};
+  cparams.use_gpu = params.service.use_gpu;
+  // Instantiate the service
+  stream_components::WhisperService whisperService(params.service, params.audio, cparams);
+
+  // Simulate a websocket stream by feeding the audio in fixed-size chunks (chunk_size samples per loop iteration).
+  std::vector<float> audio_buffer;
+  int chunk_size = 160; // samples per VAD frame; 160 samples is 10 ms at 16 kHz (the original note said 100 ms) — assumes WHISPER_SAMPLE_RATE is 16000, TODO confirm
+  SpeexPreprocessState *st = speex_preprocess_state_init(chunk_size, WHISPER_SAMPLE_RATE);
+
+  int vad = 1;  // value handed to SPEEX_PREPROCESS_SET_VAD below
+  speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_VAD, &vad);
+
+  bool last_is_speech = false;  // VAD result of the previous frame
+  // Process the audio frame by frame
+  for (size_t i = 0; i < pcmf32.size(); i += chunk_size) {
+    spx_int16_t frame[chunk_size];  // NOTE(review): variable-length array — a compiler extension in C++, not standard
+    for (int j = 0; j < chunk_size; ++j) {
+      if (i + j < pcmf32.size()) {
+        frame[j] = (spx_int16_t)(pcmf32[i + j] * 32768);  // scale float sample to 16-bit; NOTE(review): a sample of exactly 1.0 gives 32768, outside the int16 range
+      } else {
+        frame[j] = 0; // zero-pad the part of the last frame that runs past the end of the input
+      }
+    }
+    int is_speech = speex_preprocess_run(st, frame);  // non-zero is treated as "speech" below
+
+    // Append the current frame's samples to audio_buffer
+    audio_buffer.insert(audio_buffer.end(), pcmf32.begin() + i, pcmf32.begin() + std::min(i + chunk_size, pcmf32.size()));
+    printf("is_speech %d \n",is_speech);
+    if (!is_speech && last_is_speech) {  // speech just ended: the previous frame was speech, this one is not
+      bool b = whisperService.process(pcmf32.data(), pcmf32.size());  // NOTE(review): passes the whole file (pcmf32), not the accumulated audio_buffer — confirm intent; b is unused
+      const nlohmann::json &json_array = get_result(whisperService.ctx);
+      const std::basic_string<char, std::char_traits<char>, std::allocator<char>> &string = json_array.dump();
+      printf("%s\n",string.c_str());
+      return 0;  // NOTE(review): exits after the first segment; skips speex_preprocess_state_destroy(st) and makes the clear() below unreachable
+      audio_buffer.clear();
+    }
+
+    last_is_speech = is_speech != 0;
   }
+
+  speex_preprocess_state_destroy(st);
 }
diff --git a/vcpkg.json b/vcpkg.json
@@ -16,5 +16,8 @@
   }, {
     "name" : "ffmpeg",
     "version>=" : "6.1"
+  }, {
+    "name" : "speexdsp",
+    "version>=" : "1.2.1#1"
   } ]
 }
diff --git a/whisper_server_base_on_uwebsockets.cpp b/whisper_server_base_on_uwebsockets.cpp
@@ -191,7 +191,7 @@ int main(int argc, char **argv) {
       } else {
         // asr
         isOk = whisperService.process(audioBuffer.data(), audioBuffer.size());
-//        audioBuffer.clear();
+        audioBuffer.clear();
       }
       printf("%s: is_ok: %d \n", get_current_time().c_str(), isOk);
       if (isOk) {