[runtime/xpu] Support non-streaming decoding on the Kunlun XPU card #1455

Merged: 1 commit, Oct 27, 2022
37 changes: 37 additions & 0 deletions runtime/core/cmake/xpu.cmake
@@ -0,0 +1,37 @@
if(NOT WIN32)
string(ASCII 27 Esc)
set(ColourReset "${Esc}[m")
set(ColourBold "${Esc}[1m")
set(Red "${Esc}[31m")
set(Green "${Esc}[32m")
set(Yellow "${Esc}[33m")
set(Blue "${Esc}[34m")
set(Magenta "${Esc}[35m")
set(Cyan "${Esc}[36m")
set(White "${Esc}[37m")
set(BoldRed "${Esc}[1;31m")
set(BoldGreen "${Esc}[1;32m")
set(BoldYellow "${Esc}[1;33m")
set(BoldBlue "${Esc}[1;34m")
set(BoldMagenta "${Esc}[1;35m")
set(BoldCyan "${Esc}[1;36m")
set(BoldWhite "${Esc}[1;37m")
endif()

if(XPU)
set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR})
message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n")
set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu)
if(NOT DEFINED ENV{XPU_API_PATH})
message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n")
else()
set(XPU_API_PATH $ENV{XPU_API_PATH})
message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.")
endif()

include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/
${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include)
link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/)

add_definitions(-DUSE_XPU)
endif()
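
The module above resolves the Kunlun toolchain location from the `XPU_API_PATH` environment variable, adds the XDNN/XRE include and library directories, and defines `USE_XPU` for the compiler. Source files can then gate XPU-only code on that definition; the small C++ sketch below illustrates the pattern (the helper function is illustrative only, not part of this PR):

```cpp
// Illustrative only: USE_XPU is defined by xpu.cmake when building with -DXPU=ON.
#ifdef USE_XPU
#include "xpu/xpu_asr_model.h"  // XPU-specific header added elsewhere in this PR
#endif

// Reports whether this binary was compiled with the Kunlun XPU backend enabled.
bool BuiltWithXpu() {
#ifdef USE_XPU
  return true;
#else
  return false;
#endif
}
```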
10 changes: 7 additions & 3 deletions runtime/core/decoder/CMakeLists.txt
@@ -7,8 +7,8 @@ set(decoder_srcs
ctc_endpoint.cc
)

if(NOT TORCH AND NOT ONNX)
message(FATAL_ERROR "Please build with TORCH or ONNX!!!")
if(NOT TORCH AND NOT ONNX AND NOT XPU)
message(FATAL_ERROR "Please build with TORCH or ONNX or XPU!!!")
endif()
if(TORCH)
list(APPEND decoder_srcs torch_asr_model.cc)
@@ -18,7 +18,8 @@ if(ONNX)
endif()

add_library(decoder STATIC ${decoder_srcs})
target_link_libraries(decoder PUBLIC kaldi-decoder frontend post_processor utils)
target_link_libraries(decoder PUBLIC kaldi-decoder frontend
post_processor utils)

if(ANDROID)
target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY})
@@ -29,4 +30,7 @@ else()
if(ONNX)
target_link_libraries(decoder PUBLIC onnxruntime)
endif()
if(XPU)
target_link_libraries(decoder PUBLIC xpu_conformer)
endif()
endif()
26 changes: 23 additions & 3 deletions runtime/core/decoder/params.h
@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.


#ifndef DECODER_PARAMS_H_
#define DECODER_PARAMS_H_

@@ -29,17 +28,24 @@
#ifdef USE_TORCH
#include "decoder/torch_asr_model.h"
#endif
#ifdef USE_XPU
#include "xpu/xpu_asr_model.h"
#endif
#include "frontend/feature_pipeline.h"
#include "post_processor/post_processor.h"
#include "utils/flags.h"
#include "utils/string.h"

DEFINE_int32(num_threads, 1, "num threads for ASR model");
DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model");

// TorchAsrModel flags
DEFINE_string(model_path, "", "pytorch exported model path");
// OnnxAsrModel flags
DEFINE_string(onnx_dir, "", "directory where the onnx model is saved");
// XPUAsrModel flags
DEFINE_string(xpu_model_dir, "",
"directory where the XPU model and weights are saved");

// FeaturePipelineConfig flags
DEFINE_int32(num_bins, 80, "num mel bins for fbank feature");
@@ -66,7 +72,8 @@ DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search");
DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search");
DEFINE_double(blank_skip_thresh, 1.0,
"blank skip thresh for ctc wfst search, 1.0 means no skip");
DEFINE_double(length_penalty, 0.0, "length penalty ctc wfst search, will not"
DEFINE_double(length_penalty, 0.0,
"length penalty ctc wfst search, will not"
"apply on self-loop arc, for balancing the del/ins ratio, "
"suggest set to -3.0");
DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search");
@@ -130,7 +137,7 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
#else
LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'.";
#endif
} else {
} else if (!FLAGS_model_path.empty()) {
#ifdef USE_TORCH
LOG(INFO) << "Reading torch model " << FLAGS_model_path;
TorchAsrModel::InitEngineThreads(FLAGS_num_threads);
@@ -140,6 +147,19 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
#else
LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'.";
#endif
} else if (!FLAGS_xpu_model_dir.empty()) {
#ifdef USE_XPU
LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir;
auto model = std::make_shared<XPUAsrModel>();
model->SetEngineThreads(FLAGS_num_threads);
model->SetDeviceId(FLAGS_device_id);
model->Read(FLAGS_xpu_model_dir);
resource->model = model;
#else
LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'.";
#endif
} else {
LOG(FATAL) << "Please set ONNX, TORCH or XPU model path!!!";
}

LOG(INFO) << "Reading unit table " << FLAGS_unit_path;
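
The new `else if (!FLAGS_xpu_model_dir.empty())` branch is the whole integration point on the C++ side: it constructs an `XPUAsrModel`, configures threads and the device id, and loads the weights. A condensed sketch of that flow, using only the calls visible in this diff (the helper name and the elided `DecodeResource` plumbing are for illustration):

```cpp
// Sketch of the XPU model-loading path added to params.h; not a drop-in file.
#ifdef USE_XPU
#include <memory>
#include <string>
#include "xpu/xpu_asr_model.h"

std::shared_ptr<XPUAsrModel> LoadXpuAsrModel(const std::string& model_dir,
                                             int num_threads, int device_id) {
  auto model = std::make_shared<XPUAsrModel>();
  model->SetEngineThreads(num_threads);  // from --num_threads
  model->SetDeviceId(device_id);         // from --device_id, selects the Kunlun card
  model->Read(model_dir);                // from --xpu_model_dir, loads structure and weights
  return model;                          // assigned to resource->model in params.h
}
#endif
```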
2 changes: 2 additions & 0 deletions runtime/kunlun/.gitignore
@@ -0,0 +1,2 @@
build/
fc_base/
66 changes: 66 additions & 0 deletions runtime/kunlun/CMakeLists.txt
@@ -0,0 +1,66 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

project(wenet VERSION 0.1)

option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF)
option(GRAPH_TOOLS "whether to build TLG graph tools" OFF)
option(BUILD_TESTING "whether to build unit test" OFF)

option(GRPC "whether to build with gRPC" OFF)
# TODO(Binbin Zhang): Change websocket to OFF since it depends on boost
# which is a very big library
option(WEBSOCKET "whether to build with websocket" OFF)
option(XPU "whether to build with XPU" ON)

set(CMAKE_VERBOSE_MAKEFILE OFF)

include(FetchContent)
set(FETCHCONTENT_QUIET OFF)
get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_base})

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC")

# Include all dependency
include(openfst)
# This CMakeLists.txt is only used for kunlun xpu, so remove the contents
# about onnx, libtorch, gpu and windows.
include(xpu)
# Compile xpu_conformer.a and conformer_test
add_subdirectory(xpu)

include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/kaldi
)

# Build all libraries
add_subdirectory(utils)
add_subdirectory(frontend)
add_subdirectory(post_processor)
add_subdirectory(kaldi) # kaldi: wfst based decoder
add_subdirectory(decoder)
add_subdirectory(api)

# Optionally, you can build with websocket
if(WEBSOCKET)
include(boost)
add_subdirectory(websocket)
endif()

# Optionally, you can build with gRPC
if(GRPC)
include(grpc)
add_subdirectory(grpc)
endif()

# Build all bins
add_subdirectory(bin)

# Unit Test
if(BUILD_TESTING)
include(gtest)
add_subdirectory(test)
endif()
83 changes: 83 additions & 0 deletions runtime/kunlun/README.md
@@ -0,0 +1,83 @@
# Running WeNet on KUNLUNXIN XPU chips
## Introduction
The example below shows how to deploy WeNet offline and online ASR models on XPUs. XPU is a general-purpose AI computing core architecture developed 100% in-house by KUNLUNXIN.

## Setting up the XPU runtime environment

Before you start, make sure the following components are available:

XRE (XPU Runtime Environment): the basic runtime environment for Kunlun chips, including functional modules such as chip drivers, the runtime API library, and firmware (FW) tools.
XDNN (XPU Deep Neural Network Library): the Kunlun library for accelerating deep neural networks, providing the high-performance DNN functions used in applications.

If you need any help, or would like to learn more about Kunlun chips, please contact us through the official website:
https://www.kunlunxin.com.cn/

## Instructions
- Step 1. Build; CMake 3.14 or above is required.

``` sh
export CXX=${your_g++_path}
export CC=${your_gcc_path}
export XPU_API_PATH=${your_api_path}

# -r : release version; -d : debug version
bash ./compile.sh -r
```

- Step 2. Run the test; the results are printed to the console.

``` sh
## set KUNLUN XPU visible device
export XPU_VISIBLE_DEVICES=0
export XPUSIM_DEVICE_MODEL=KUNLUN2
## set logging level
export GLOG_logtostderr=1
export GLOG_v=3
## set speech wav and model/weight path
wav_path=${your_test_wav_path}
xpu_model_dir=${your_xpu_weight_dir}
units=${your_units.txt}
## executive command
./build/bin/decoder_main \
--chunk_size -1 \
--wav_path ${wav_path} \
--xpu_model_dir ${xpu_model_dir} \
--unit_path ${units} \
--device_id 0 \
--nbest 3 2>&1 | tee log.txt
```

The decoding result for a single utterance looks like the following:

``` sh
XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4
I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6
I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538
I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538
I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1
I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418
I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103
I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512
I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538
I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3
I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3
I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3
I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14
I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
test 甚至出现交易几乎停滞的情况
I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
```
87 changes: 87 additions & 0 deletions runtime/kunlun/README_EN.md
@@ -0,0 +1,87 @@
# WeNet running on KUNLUNXIN XPU device
## Introduction
The example below shows how to deploy WeNet offline and online ASR models on XPUs.
XPU is a general-purpose AI computing core architecture developed 100% in-house by KUNLUNXIN.

## Setup environment for XPU device

Before you start, make sure the following components are available:

XRE (XPU Runtime Environment): the basic runtime environment for XPUs, including functional modules such as chip drivers, the runtime API library, and firmware (FW) tools.

XDNN (XPU Deep Neural Network Library): the XPU library for accelerating deep neural networks, providing the high-performance DNN functions used in applications.

If you would like to know more about XPUs or need any help, please contact us through the official website:

https://www.kunlunxin.com.cn/

## Instructions
- Step 1. Build; CMake 3.14 or above is required.

``` sh
export CXX=${your_g++_path}
export CC=${your_gcc_path}
export XPU_API_PATH=${your_api_path}

# -r : release version; -d : debug version
bash ./compile.sh -r
```

- Step 2. Run the test; the result is printed to the console.

``` sh
## set KUNLUN XPU visible device
export XPU_VISIBLE_DEVICES=0
export XPUSIM_DEVICE_MODEL=KUNLUN2
## set logging level
export GLOG_logtostderr=1
export GLOG_v=3
## set speech wav and model/weight/units path
wav_path=${your_test_wav_path}
xpu_model_dir=${your_xpu_weight_dir}
units=${your_units.txt}
## executive command
./build/bin/decoder_main \
--chunk_size -1 \
--wav_path $wav_path \
--xpu_model_dir $xpu_model_dir \
--unit_path $units \
--device_id 0 \
--nbest 3 2>&1 | tee log.txt
```

A typical output looks like the following:

``` sh
XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4
I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6
I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538
I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538
I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1
I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418
I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103
I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512
I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538
I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3
I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3
I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3
I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14
I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
test 甚至出现交易几乎停滞的情况
I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
```
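
The last log line reports the total audio duration against the decoding time (4203 ms of audio decoded in 10 ms). For reference, the real-time factor (RTF) implied by those two numbers can be computed as in this small sketch (values copied from the log above):

```cpp
#include <cstdio>

int main() {
  // Values taken from the final log line: "Total: decoded 4203ms audio taken 10ms."
  const double audio_ms = 4203.0;
  const double decode_ms = 10.0;
  const double rtf = decode_ms / audio_ms;  // lower is faster; < 1.0 means faster than real time
  std::printf("RTF = %.4f (about %.0fx real-time speed)\n", rtf, audio_ms / decode_ms);
  return 0;
}
```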
1 change: 1 addition & 0 deletions runtime/kunlun/api
1 change: 1 addition & 0 deletions runtime/kunlun/bin
1 change: 1 addition & 0 deletions runtime/kunlun/cmake