[runtime/xpu] 🐻Support the execution of non-streaming parsing on the …

…Kunlun XPU card #1455
wenet-e2e · Oct 27, 2022 · 9b8915d · 9b8915d
1 parent 0befece
commit 9b8915d
Show file tree

Hide file tree

Showing 28 changed files with 3,463 additions and 6 deletions.
diff --git a/runtime/core/cmake/xpu.cmake b/runtime/core/cmake/xpu.cmake
@@ -0,0 +1,37 @@
+if(NOT WIN32)  # ANSI escape sequences for coloured console messages; this file leaves them unset on Windows.
+  string(ASCII 27 Esc)  # Esc = the ASCII 27 (escape) character that starts each sequence.
+  set(ColourReset "${Esc}[m")  # Appended after coloured text, e.g. in the FATAL_ERROR message further down.
+  set(ColourBold  "${Esc}[1m")
+  set(Red         "${Esc}[31m")  # Plain foreground colours use codes 31-37.
+  set(Green       "${Esc}[32m")
+  set(Yellow      "${Esc}[33m")
+  set(Blue        "${Esc}[34m")
+  set(Magenta     "${Esc}[35m")
+  set(Cyan        "${Esc}[36m")
+  set(White       "${Esc}[37m")
+  set(BoldRed     "${Esc}[1;31m")  # Bold variants prefix the same colour code with "1;".
+  set(BoldGreen   "${Esc}[1;32m")
+  set(BoldYellow  "${Esc}[1;33m")
+  set(BoldBlue    "${Esc}[1;34m")
+  set(BoldMagenta "${Esc}[1;35m")
+  set(BoldCyan    "${Esc}[1;36m")
+  set(BoldWhite   "${Esc}[1;37m")
+endif()
+
+if(XPU)  # Kunlun XPU build setup: include/link search paths plus the USE_XPU definition.
+  set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR})  # Source dir of the CMakeLists.txt that includes this file.
+  message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n")
+  set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu)
+  if(NOT DEFINED ENV{XPU_API_PATH})  # The XPU API location is read only from the environment; configuration aborts without it.
+    message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n")
+  else()
+    set(XPU_API_PATH $ENV{XPU_API_PATH})  # Copy of the environment value taken at configure time.
+    message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.")
+  endif()
+
+  include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/
+                      ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include)
+  link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/)  # Presumably the XPU API and runtime shared-library dirs - confirm against the SDK layout.
+
+  add_definitions(-DUSE_XPU)  # Turns on the "#ifdef USE_XPU" sections of the C++ sources.
+endif()
diff --git a/runtime/core/decoder/CMakeLists.txt b/runtime/core/decoder/CMakeLists.txt
@@ -7,8 +7,8 @@ set(decoder_srcs
   ctc_endpoint.cc
 )
 
-if(NOT TORCH AND NOT ONNX)
-  message(FATAL_ERROR "Please build with TORCH or ONNX!!!")
+if(NOT TORCH AND NOT ONNX AND NOT XPU)
+  message(FATAL_ERROR "Please build with TORCH or ONNX or XPU!!!")
 endif()
 if(TORCH)
   list(APPEND decoder_srcs torch_asr_model.cc)
@@ -18,7 +18,8 @@ if(ONNX)
 endif()
 
 add_library(decoder STATIC ${decoder_srcs})
-target_link_libraries(decoder PUBLIC kaldi-decoder frontend post_processor utils)
+target_link_libraries(decoder PUBLIC kaldi-decoder frontend
+                      post_processor utils)
 
 if(ANDROID)
   target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY})
@@ -29,4 +30,7 @@ else()
   if(ONNX)
     target_link_libraries(decoder PUBLIC onnxruntime)
   endif()
+  if(XPU)
+    target_link_libraries(decoder PUBLIC xpu_conformer)
+  endif()
 endif()
diff --git a/runtime/core/decoder/params.h b/runtime/core/decoder/params.h
@@ -13,7 +13,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-
 #ifndef DECODER_PARAMS_H_
 #define DECODER_PARAMS_H_
 
@@ -29,17 +28,24 @@
 #ifdef USE_TORCH
 #include "decoder/torch_asr_model.h"
 #endif
+#ifdef USE_XPU
+#include "xpu/xpu_asr_model.h"
+#endif
 #include "frontend/feature_pipeline.h"
 #include "post_processor/post_processor.h"
 #include "utils/flags.h"
 #include "utils/string.h"
 
 DEFINE_int32(num_threads, 1, "num threads for ASR model");
+DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model");
 
 // TorchAsrModel flags
 DEFINE_string(model_path, "", "pytorch exported model path");
 // OnnxAsrModel flags
 DEFINE_string(onnx_dir, "", "directory where the onnx model is saved");
+// XPUAsrModel flags
+DEFINE_string(xpu_model_dir, "",
+              "directory where the XPU model and weights is saved");
 
 // FeaturePipelineConfig flags
 DEFINE_int32(num_bins, 80, "num mel bins for fbank feature");
@@ -66,7 +72,8 @@ DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search");
 DEFINE_double(blank_skip_thresh, 1.0,
               "blank skip thresh for ctc wfst search, 1.0 means no skip");
-DEFINE_double(length_penalty, 0.0, "length penalty ctc wfst search, will not"
+DEFINE_double(length_penalty, 0.0,
+              "length penalty ctc wfst search, will not "  // keep the space
               "apply on self-loop arc, for balancing the del/ins ratio, "
               "suggest set to -3.0");
 DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search");
@@ -130,7 +137,7 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
 #else
     LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'.";
 #endif
-  } else {
+  } else if (!FLAGS_model_path.empty()) {
 #ifdef USE_TORCH
     LOG(INFO) << "Reading torch model " << FLAGS_model_path;
     TorchAsrModel::InitEngineThreads(FLAGS_num_threads);
@@ -140,6 +147,19 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
 #else
     LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'.";
 #endif
+  } else if (!FLAGS_xpu_model_dir.empty()) {
+#ifdef USE_XPU
+    LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir;
+    auto model = std::make_shared<XPUAsrModel>();
+    model->SetEngineThreads(FLAGS_num_threads);
+    model->SetDeviceId(FLAGS_device_id);
+    model->Read(FLAGS_xpu_model_dir);
+    resource->model = model;
+#else
+    LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'.";
+#endif
+  } else {
+    LOG(FATAL) << "Please set ONNX, TORCH or XPU model path!!!";
   }
 
   LOG(INFO) << "Reading unit table " << FLAGS_unit_path;

diff --git a/runtime/kunlun/.gitignore b/runtime/kunlun/.gitignore
@@ -0,0 +1,2 @@
+build/
+fc_base/
diff --git a/runtime/kunlun/CMakeLists.txt b/runtime/kunlun/CMakeLists.txt
@@ -0,0 +1,69 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+project(wenet VERSION 0.1)
+
+option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF)
+option(GRAPH_TOOLS "whether to build TLG graph tools" OFF)
+option(BUILD_TESTING "whether to build unit test" OFF)  # Gates the gtest/test section at the end of this file.
+
+option(GRPC "whether to build with gRPC" OFF)
+# NOTE: WEBSOCKET defaults to OFF because enabling it pulls in boost,
+#       which is a very big library
+option(WEBSOCKET "whether to build with websocket" OFF)
+option(XPU "whether to build with XPU" ON)  # On by default: this build tree targets the Kunlun XPU.
+
+set(CMAKE_VERBOSE_MAKEFILE OFF)  # Do not echo full compile/link command lines.
+
+include(FetchContent)
+set(FETCHCONTENT_QUIET OFF)  # Show FetchContent progress output.
+get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")  # Absolute path of <this dir>/fc_base.
+set(FETCHCONTENT_BASE_DIR ${fc_base})  # Fetched dependencies are stored under fc_base/.
+
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)  # Lets include(<name>) resolve modules from ./cmake.
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC")  # Global flags: C++14, pthreads, position-independent code.
+
+# Include all dependencies
+include(openfst)
+# This CMakeLists.txt is only used for Kunlun XPU, so the sections about
+#  onnx, libtorch, gpu and windows have been removed.
+include(xpu)
+# Compile xpu_conformer.a and conformer_test
+add_subdirectory(xpu)
+
+include_directories(  # Directory-scoped: applies to the subdirectories added after this call.
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/kaldi
+)
+
+# Build all libraries
+add_subdirectory(utils)
+if(NOT MSVC)  # Outside MSVC, make the openfst target build before utils.
+  add_dependencies(utils openfst)
+endif()
+add_subdirectory(frontend)
+add_subdirectory(post_processor)
+add_subdirectory(kaldi)  # kaldi: wfst based decoder
+add_subdirectory(decoder)
+add_subdirectory(api)
+
+# Optionally, you can build with websocket (off unless -DWEBSOCKET=ON)
+if(WEBSOCKET)
+  include(boost)  # The websocket code depends on boost.
+  add_subdirectory(websocket)
+endif()
+
+# Optionally, you can build with gRPC (off unless -DGRPC=ON)
+if(GRPC)
+  include(grpc)
+  add_subdirectory(grpc)
+endif()
+
+# Build all bins
+add_subdirectory(bin)
+
+# Unit tests (off unless -DBUILD_TESTING=ON)
+if(BUILD_TESTING)
+  include(gtest)
+  add_subdirectory(test)
+endif()
diff --git a/runtime/kunlun/README.md b/runtime/kunlun/README.md
@@ -0,0 +1,83 @@
+# 在昆仑芯片上运行Wenet
+## 介绍
+下面的示例展示了如何在XPU上部署WeNet离线或在线的ASR模型。XPU是一种由昆仑芯100%自主研发的通用人工智能计算核心架构。
+
+## 准备XPU运行环境
+
+在开始之前，请确认您获得以下必须的环境。
+
+    XRE(XPU Runtime Environment):昆仑芯片的基础运行环境，包括芯片驱动程序、runtime api库、固件FW工具等功能模块。
+    XDNN(XPU Deep Neural Network Library):加速深度神经网络的昆仑芯片库，提供应用程序中使用的高性能DNN功能库。
+
+如果您需要任何帮助，或是想要进一步了解昆仑芯片，请通过官方网址联系我们：
+https://www.kunlunxin.com.cn/
+
+## 操作步骤
+- 第一步：构建，需要cmake 3.14及以上版本
+
+``` sh
+export CXX=${your_g++_path}
+export CC=${your_gcc_path}
+export XPU_API_PATH=${your_api_path}
+
+# -r : release version; -d : debug version
+bash ./compile.sh -r
+```
+
+- 第二步：测试，测试结果将在控制台输出
+
+``` sh
+## set KUNLUN XPU visible device
+export XPU_VISIBLE_DEVICES=0
+export XPUSIM_DEVICE_MODEL=KUNLUN2
+## set logging level
+export GLOG_logtostderr=1
+export GLOG_v=3
+## set speech wav and model/weight path
+wav_path=${your_test_wav_path}
+xpu_model_dir=${your_xpu_weight_dir}
+units=${your_units.txt}
+## executive command
+./build/bin/decoder_main \
+    --chunk_size -1 \
+    --wav_path ${wav_path} \
+    --xpu_model_dir ${xpu_model_dir} \
+    --unit_path ${units}   \
+    --device_id 0           \
+    --nbest  3  2>&1 | tee log.txt
+```
+
+单条语音执行结果如下所示:
+
+``` sh
+XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
+I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
+I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
+I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
+I1027 06:06:23.832749 111767 xpu_asr_model.cc:66]       subsampling_rate 4
+I1027 06:06:23.832777 111767 xpu_asr_model.cc:67]       right_context 6
+I1027 06:06:23.832789 111767 xpu_asr_model.cc:68]       sos 5538
+I1027 06:06:23.832795 111767 xpu_asr_model.cc:69]       eos 5538
+I1027 06:06:23.832799 111767 xpu_asr_model.cc:70]       is bidirectional decoder 1
+I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
+I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
+I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
+I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
+I1027 06:06:23.843616 111776 xpu_asr_model.cc:173]       max_seqlen is 418
+I1027 06:06:23.843619 111776 xpu_asr_model.cc:174]       q_seqlen   is 103
+I1027 06:06:23.843623 111776 xpu_asr_model.cc:175]       att_dim    is 512
+I1027 06:06:23.843626 111776 xpu_asr_model.cc:176]       ctc_dim    is 5538
+I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
+I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
+I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
+I1027 06:06:23.852537 111776 xpu_asr_model.cc:248]       num_hyps  is 3
+I1027 06:06:23.852541 111776 xpu_asr_model.cc:249]       beam_size is 3
+I1027 06:06:23.852545 111776 xpu_asr_model.cc:250]       new_bs    is 3
+I1027 06:06:23.852545 111776 xpu_asr_model.cc:251]       max_hyps_len is 14
+I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
+I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
+test 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
+```
diff --git a/runtime/kunlun/README_EN.md b/runtime/kunlun/README_EN.md
@@ -0,0 +1,87 @@
+# WeNet running on KUNLUNXIN XPU device
+## Introduction
+The example below shows how to deploy WeNet offline and online ASR models on XPUs.
+XPU is a core architecture 100% independently developed by KUNLUNXIN for general artificial intelligence computing.
+
+## Setup environment for XPU device
+
+Before you start, make sure you have the following required environment components:
+
+    XRE (XPU Runtime Environment): The basic operating environment of the XPUs,
+    which includes functional modules such as chip drivers, the runtime API library, and firmware tools.
+
+    XDNN (XPU Deep Neural Network Library): The XPU library for accelerating deep neural networks, providing a high-performance DNN function library for use in applications.
+
+If you would like to know more about XPUs or need any help, please contact us through the official website:
+
+https://www.kunlunxin.com.cn/
+
+## Instruction
+- Step 1. Build, the build requires cmake 3.14 or above.
+
+``` sh
+export CXX=${your_g++_path}
+export CC=${your_gcc_path}
+export XPU_API_PATH=${your_api_path}
+
+# -r : release version; -d : debug version
+bash ./compile.sh -r
+```
+
+- Step 2. Testing, the result is shown in the console.
+
+``` sh
+## set KUNLUN XPU visible device
+export XPU_VISIBLE_DEVICES=0
+export XPUSIM_DEVICE_MODEL=KUNLUN2
+## set logging level
+export GLOG_logtostderr=1
+export GLOG_v=3
+## set speech wav and model/weight/units path
+wav_path=${your_test_wav_path}
+xpu_model_dir=${your_xpu_weight_dir}
+units=${your_units.txt}
+## executive command
+./build/bin/decoder_main \
+    --chunk_size -1 \
+    --wav_path $wav_path \
+    --xpu_model_dir $xpu_model_dir \
+    --unit_path $units   \
+    --device_id 0           \
+    --nbest  3  2>&1 | tee log.txt
+```
+
+A typical output looks like this:
+
+``` sh
+XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
+I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
+I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
+I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
+I1027 06:06:23.832749 111767 xpu_asr_model.cc:66]       subsampling_rate 4
+I1027 06:06:23.832777 111767 xpu_asr_model.cc:67]       right_context 6
+I1027 06:06:23.832789 111767 xpu_asr_model.cc:68]       sos 5538
+I1027 06:06:23.832795 111767 xpu_asr_model.cc:69]       eos 5538
+I1027 06:06:23.832799 111767 xpu_asr_model.cc:70]       is bidirectional decoder 1
+I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
+I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
+I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
+I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
+I1027 06:06:23.843616 111776 xpu_asr_model.cc:173]       max_seqlen is 418
+I1027 06:06:23.843619 111776 xpu_asr_model.cc:174]       q_seqlen   is 103
+I1027 06:06:23.843623 111776 xpu_asr_model.cc:175]       att_dim    is 512
+I1027 06:06:23.843626 111776 xpu_asr_model.cc:176]       ctc_dim    is 5538
+I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
+I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
+I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
+I1027 06:06:23.852537 111776 xpu_asr_model.cc:248]       num_hyps  is 3
+I1027 06:06:23.852541 111776 xpu_asr_model.cc:249]       beam_size is 3
+I1027 06:06:23.852545 111776 xpu_asr_model.cc:250]       new_bs    is 3
+I1027 06:06:23.852545 111776 xpu_asr_model.cc:251]       max_hyps_len is 14
+I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
+I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
+test 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
+```
diff --git a/runtime/kunlun/api b/runtime/kunlun/api
@@ -0,0 +1 @@
+../core/api
diff --git a/runtime/kunlun/bin b/runtime/kunlun/bin
@@ -0,0 +1 @@
+../core/bin
diff --git a/runtime/kunlun/cmake b/runtime/kunlun/cmake
@@ -0,0 +1 @@
+../core/cmake